def test_merge_index_val():
    """ReplaceValues on a nominal column: one group of values is merged, another becomes NaN."""
    dates = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    frame = data.Frame({
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dates
    })

    replaceOp = ReplaceValues()
    replaceOp.addInputShape(frame.shape, 0)
    # In column 1: '3' and '4' are replaced with 'h', '6' and '0' with NaN
    replaceOp.setOptions(table={1: {
        'values': '3 4; 6 0',
        'replace': 'h; nan'
    }}, inverted=False)

    expectedShape = frame.shape.clone()
    outputShape = replaceOp.getOutputShape()
    # The replaced column must still be nominal
    assert frame.shape.colTypes[1] == Types.Nominal == outputShape.colTypes[1]
    assert outputShape == expectedShape

    result = replaceOp.execute(frame)
    assert result.shape == frame.shape
    replacedColumn = data.Frame(result.getRawFrame()['col2']).to_dict()
    assert nan_to_None(replacedColumn) == {'col2': ["h", "h", "5", None, None]}
def test_join_on_index():
    """Inner join of two frames on their indexes, with custom suffixes for clashing columns."""
    leftData = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    rightData = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
                          dtype='datetime64[ns]')
    }
    left = data.Frame(leftData).setIndex('col1')
    right = data.Frame(rightData).setIndex('col2')

    joinOp = Join()
    assert joinOp.getOptions() == ('_l', '_r', True, None, None, jt.Left)
    assert joinOp.getOutputShape() is None
    # NOTE: per the original author's remark, options CAN be set before the input shapes,
    # so no OptionValidationError is checked at this point.
    assert joinOp.getOutputShape() is None

    # The output shape is only available once both inputs are known
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Now set options
    customOpts = ('_ll', '_rr', True, None, None, jt.Inner)
    joinOp.setOptions(*customOpts)
    assert joinOp.getOptions() == customOpts

    columnTypes = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime
    }
    # Note that join does not preserve index name
    indexTypes = {'Unnamed': IndexType(Types.String)}
    expectedShape = data.Shape.fromDict(columnTypes, indexTypes)
    assert joinOp.getOutputShape() == expectedShape

    joined = joinOp.execute(left, right)
    assert joined.shape == expectedShape
def test_join_on_multiindex():
    """Outer join of two frames that both carry a two-level index."""
    leftData = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': ['3', '4', '5', '6', '0'],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    rightData = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
                          dtype='datetime64[ns]')
    }
    left = data.Frame(leftData).setIndex(['col1', 'col2'])  # String, String
    right = data.Frame(rightData).setIndex(['col2', 'cowq'])  # Category, Numeric

    joinOp = Join()
    assert joinOp.getOptions() == ('_l', '_r', True, None, None, jt.Left)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Now set options
    customOpts = ('_ll', '_rr', True, None, None, jt.Outer)
    joinOp.setOptions(*customOpts)
    assert joinOp.getOptions() == customOpts

    columnTypes = {
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime
    }
    # Join on multiindex is different: the expected index keeps the level names of both inputs
    indexTypes = {
        'col1': IndexType(Types.String),
        'col2': IndexType(Types.String),
        'cowq': IndexType(Types.Numeric)
    }
    expectedShape = data.Shape.fromDict(columnTypes, indexTypes)
    assert joinOp.getOutputShape() == expectedShape

    joined = joinOp.execute(left, right)
    assert joined.shape == expectedShape
def execute(self, dfl: data.Frame, dfr: data.Frame) -> data.Frame:
    """
    Combine two frames, either on their indexes or on one column from each side.

    :param dfl: left frame
    :param dfr: right frame
    :return: a new frame holding the joined (or merged) data
    """
    if not self.__onIndex:
        # Join (merge) on columns: the left/right column positions must be set
        leftColumn = dfl.shape.colNames[self.__leftOn]
        rightColumn = dfr.shape.colNames[self.__rightOn]
        merged = dfl.getRawFrame().merge(dfr.getRawFrame(),
                                         how=self.__type.value,
                                         left_on=leftColumn,
                                         right_on=rightColumn,
                                         suffixes=(self.__lSuffix, self.__rSuffix))
        return data.Frame(merged)

    # Join on indexes
    joined = dfl.getRawFrame().join(dfr.getRawFrame(),
                                    how=self.__type.value,
                                    lsuffix=self.__lSuffix,
                                    rsuffix=self.__rSuffix)
    return data.Frame(joined)
def appendEmptyRow(self) -> bool:
    """
    Append a placeholder entry (an empty frame named ' ') to the workbench.

    :return: always True
    """
    newRow = self.rowCount()
    # Dummy entry backed by an empty frame
    placeholder = FrameModelMock(data.Frame(), ' ')
    self.__workbench.append(placeholder)
    # Remember at which row the placeholder was stored
    self.__nameToIndex[placeholder.name] = newRow
    return True
def test_standardScale_1_attr_all_nan():
    """StandardScaler applied to a column that contains only NaN values."""
    allNan = [np.nan, np.nan, np.nan, np.nan, np.nan]
    rawData = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1.0, -1.1, 3.0, 7.5, 10.0],
        'col2': allNan,
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    frame = data.Frame(rawData).setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={1: None})
    # Options alone are not enough: an input shape is required
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expectedShape = frame.shape.clone()
    assert scaler.getOutputShape() == expectedShape
    # Removing the input shape resets the output shape
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaledFrame = scaler.execute(frame)

    # Mean and standard deviation of an all-NaN column are NaN, so every scaled value is NaN
    nanMean = np.nanmean(allNan)
    nanStd = np.nanstd(allNan)
    expected = {
        'col1': rawData['col1'],
        'col2': [(x - nanMean) / nanStd for x in allNan],
        'ww': [3, None, 'ww', '1', '1']
    }
    actualRounded = nan_to_None(roundValues(scaledFrame.to_dict(), 4))
    assert actualRounded == nan_to_None(roundValues(expected, 4))
    assert scaledFrame.shape == expectedShape
def execute(self, df: data.Frame) -> data.Frame:
    """
    Discretize the selected columns into ``k`` ordinal bins with ``KBinsDiscretizer``.

    NaN rows are ignored when fitting and stay NaN in the output. Each processed column
    becomes an ordered categorical with categories ``'0.0' ... str(float(k - 1))``. If an
    attribute suffix is set the result goes to a column named ``<column><suffix>``,
    otherwise it replaces the source column.

    :param df: input frame (it is deep-copied, not modified)
    :return: a new frame with the discretized columns
    """
    frame = copy.deepcopy(df)
    f = frame.getRawFrame()
    # Operation ignores nan values
    nanRows = f.iloc[:, list(self.__attributes.keys())].isnull()
    # For every column, transform every non-nan row
    columns = f.columns
    edges: Dict[int, List[float]] = dict()
    for col, k in self.__attributes.items():
        colName = columns[col]
        notNa = (~nanRows.loc[:, colName]).to_numpy()
        discretizer = skp.KBinsDiscretizer(n_bins=k, encode='ordinal',
                                           strategy=self.__strategy.value)
        # Discretize and convert to string (since categories are strings)
        result = discretizer.fit_transform(
            f.loc[notNa, colName].values.reshape(-1, 1)).astype(str)
        name: str = colName + self.__attributeSuffix if self.__attributeSuffix else colName
        # Build the output in a separate object column: this avoids writing string labels
        # into the (numeric) source column when no suffix is set
        binned = pd.Series(np.nan, index=f.index, dtype=object)
        binned.loc[notNa] = result.ravel()
        # Replace/add the whole column with `f[name] = ...`. `f.loc[:, name] = ...` sets the
        # values in place on recent pandas and would silently drop the categorical dtype
        f[name] = binned.astype(
            pd.CategoricalDtype(categories=[str(float(i)) for i in range(k)], ordered=True))
        edges[col] = discretizer.bin_edges_[0].tolist()
    # Log what has been done
    self.__logExecution(columns, edges)
    return data.Frame(f)
def test_nan_removerows_byperc():
    """RemoveNanRows driven by the percentage threshold."""
    frame = data.Frame({
        'col1': [1, 2, 3, np.nan, 10],
        'col2': [3, 4, np.nan, np.nan, np.nan],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    })

    removeOp = RemoveNanRows()
    assert removeOp.getOutputShape() is None
    # When both thresholds are given only the percentage is kept
    removeOp.setOptions(number=12121, percentage=0.3)
    assert removeOp.getOptions() == (0.3, None)

    removeOp.addInputShape(frame.shape, 0)
    expectedShape = frame.shape.clone()
    assert removeOp.getOutputShape() == expectedShape

    # One row exceeds 30% of NaN values
    filtered = removeOp.execute(frame)
    assert filtered != frame
    assert filtered.shape == expectedShape
    assert filtered.nRows == 4

    # With a 50% threshold nothing is removed
    removeOp.setOptions(percentage=0.5, number=1)
    filtered = removeOp.execute(frame)
    assert filtered == frame
    assert filtered.nRows == 5
def test_fillnan_ffill():
    """FillNan in forward-fill mode on string, ordinal and datetime columns."""
    dates = pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                      dtype='datetime64[ns]')
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': dates
    }
    frame = data.Frame(rawData).setIndex('col1')

    fillOp = FillNan()
    assert fillOp.getOutputShape() is None
    fillOp.addInputShape(frame.shape, 0)
    fillOp.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert fillOp.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill'
    }

    # Expected output shape, built by hand
    expectedShape = Shape()
    expectedShape.colNames = ['col3', 'col2', 'date']
    expectedShape.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expectedShape.index = ['col1']
    expectedShape.indexTypes = [IndexType(Types.Numeric)]
    assert fillOp.getOutputShape() == expectedShape

    filled = fillOp.execute(frame)
    assert filled.shape == expectedShape

    # Missing dates all follow the first entry, so they are filled with it
    expectedDates = ['1988-05-09' if pd.isna(t) else t.strftime(format='%Y-%m-%d')
                     for t in dates]
    actual = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    assert actual == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expectedDates
    }
def test_merge_from_nan():
    """ReplaceValues where NaN itself is among the values to be replaced."""
    frame = data.Frame({
        'cowq': [1, 2, None, 4.0, None],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x']
    })

    replaceOp = ReplaceValues()
    replaceOp.addInputShape(frame.shape, 0)
    # In column 0: NaN and 2.0 become -1, 4.0 becomes -2
    replaceOp.setOptions(table={0: {
        'values': 'Nan 2.0;4.0',
        'replace': '-1;-2'
    }}, inverted=False)

    expectedShape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replaceOp.getOutputShape() == expectedShape

    replaced = replaceOp.execute(frame)
    assert replaced.shape == frame.shape
    assert nan_to_None(replaced.to_dict()) == {
        'cowq': [1.0, -1.0, -1.0, -2.0, -1.0],
        'col2': ["3", "4", "5", "6", "0"],
        'col3': ['q', '2', 'c', '4', 'x']
    }
def test_merge_nan():
    """ReplaceValues where the replacement is NaN, spelled with different letter cases."""
    frame = data.Frame({
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x']
    })

    replaceOp = ReplaceValues()
    replaceOp.addInputShape(frame.shape, 0)
    replacementTable = {
        1: {
            'values': 'hello 2 6 0; 3',
            'replace': 'NAN; nan'
        },
        0: {
            'values': '2 4 10',
            'replace': 'naN'
        }
    }
    replaceOp.setOptions(table=replacementTable, inverted=False)

    expectedShape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replaceOp.getOutputShape() == expectedShape

    replaced = replaceOp.execute(frame)
    assert replaced.shape == frame.shape
    assert nan_to_None(replaced.to_dict()) == {
        'cowq': [1, None, 3, None, None],
        'col2': [None, "4", "5", None, None],
        'col3': ['q', '2', 'c', '4', 'x']
    }
def test_merge_category_inverted():
    """ReplaceValues on a nominal column with the 'inverted' option enabled."""
    dateStrings = ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': list(dateStrings)
    })

    replaceOp = ReplaceValues()
    replaceOp.addInputShape(frame.shape, 0)
    replaceOp.setOptions(table={1: {
        'values': '4 0; 0',
        'replace': 'val; NAN'
    }}, inverted=True)

    expectedShape = frame.shape.clone()
    assert replaceOp.getOutputShape() == expectedShape
    assert expectedShape.colTypes[1] == Types.Nominal

    replaced = replaceOp.execute(frame)
    assert nan_to_None(replaced.to_dict()) == {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [None, None, None, None, "0"],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dateStrings
    }
    assert replaced != frame
    assert replaced.shape == expectedShape
def test_duplicate_columns():
    """DuplicateColumn: options are deep-copied and the copies are appended to the shape."""
    d = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', 2, 'q', 'q', 2],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    f = data.Frame(d)
    # setIndex returns the indexed frame (every other test reassigns it); the original call
    # discarded the result, so the index was never actually set
    f = f.setIndex('col1')

    op = DuplicateColumn()
    op.addInputShape(f.shape, 0)
    # No options yet, so no output shape
    assert op.getOutputShape() is None

    opt = {'table': {0: {'rename': 'a name'}, 2: {'rename': 'new'}}}
    op.setOptions(**opt)
    assert op.getOptions() == opt

    # Mutating the dictionary passed to setOptions must not affect the stored options
    copt = deepcopy(opt)
    opt['table'][0]['rename'] = 'newnn'
    assert op.getOptions() == copt and op.getOptions() != opt

    # Expected shape: the input shape plus one new column per duplicated attribute,
    # each with the type of its source column
    s = f.shape.clone()
    s.colNames.append('a name')
    s.colNames.append('new')
    s.colTypes.append(s.colTypes[0])
    s.colTypes.append(s.colTypes[2])
    assert op.getOutputShape() == s

    g = op.execute(f)
    assert g != f and g.shape == s
def execute(self, df: data.Frame) -> data.Frame:
    """
    Return a copy of ``df`` in which every selected column is duplicated under its new name.

    :param df: input frame (left untouched, a deep copy is modified)
    :return: a new frame containing the original columns plus the duplicates
    """
    rawCopy = df.getRawFrame().copy(True)
    # self.__attributes maps the position of a source column to the name of its duplicate
    sourcePositions = list(self.__attributes.keys())
    duplicateNames = list(self.__attributes.values())
    rawCopy[duplicateNames] = rawCopy.iloc[:, sourcePositions]
    return data.Frame(rawCopy)
def test_exception():
    """ReplaceValues must reject the replacement tables listed below."""
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    })
    replaceOp = ReplaceValues()
    replaceOp.addInputShape(frame.shape, 0)

    invalidTables = (
        {1: {'replace': '7; h', 'values': '3 4 5; 2'}},
        {1: {'replace': '7; 8;1', 'values': '3 4 5; 2'}},
    )
    for badTable in invalidTables:
        with pytest.raises(exp.OptionValidationError):
            replaceOp.setOptions(table=badTable, inverted=False)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Rescale the selected columns to their configured feature range with ``minmax_scale``.

    :param df: input frame (left untouched, a deep copy is processed)
    :return: a new frame with the scaled columns, in the original column order
    """
    pdf = df.getRawFrame().copy(True)
    columns = df.getRawFrame().columns.to_list()

    distinctRanges = set(self.__attributes.values())
    if len(distinctRanges) != 1:
        # Ranges differ between columns: scale them one by one
        scaledByName = dict()
        for position, featureRange in self.__attributes.items():
            scaledByName[columns[position]] = minmax_scale(pdf.iloc[:, position],
                                                           feature_range=featureRange,
                                                           axis=0, copy=True)
        processed = pd.DataFrame(scaledByName).set_index(pdf.index)
    else:
        # All ranges are the same, shortcut: scale the whole selection in one call
        selection = pdf.iloc[:, list(self.__attributes.keys())]
        scaledValues = minmax_scale(selection, feature_range=distinctRanges.pop(),
                                    axis=0, copy=True)
        processed = pd.DataFrame(scaledValues).set_index(pdf.index)
        processed.columns = selection.columns

    # Merge result with other columns preserving order
    untouched = pdf.drop(columns=processed.columns)
    result = pd.concat([untouched, processed], ignore_index=False, axis=1)[columns]
    return data.Frame(result)
def test_discretize_by_date_with_None():
    """DateDiscretizer with a suffix, missing dates and an already existing target column."""
    coldValues = [pd.Timestamp('05-09-1988'), pd.Timestamp('22-12-1994'),
                  pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2012')]
    cold2Values = [pd.Timestamp('01-01-1950'), pd.Timestamp('22-12-1980'),
                   pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2034')]
    rawData = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': list(coldValues),
        'cold2': list(cold2Values),
        'cold_disc': [None, None, None, None, None]  # test to see if it is removed
    }
    frame = data.Frame(rawData).setIndex('col2')

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    shapeDict = frame.shape.columnsDict
    assert shapeDict['cold'] == Types.Datetime
    assert shapeDict['cold2'] == Types.Datetime

    intervals = [pd.Timestamp('01-01-1950'), pd.Timestamp('01-01-1970'),
                 pd.Timestamp('01-01-1990'), pd.Timestamp('01-01-2010'),
                 pd.Timestamp('01-01-2030')]

    def selection():
        # A fresh dictionary on every call, so options and expectation never share state
        return {
            1: {'ranges': (intervals, True, False), 'labels': ['50', '70', '80', 'now']},
            2: {'ranges': (intervals, True, True), 'labels': ['50', '70', '80', 'now']}
        }

    discretizer.setOptions(selected=selection(), suffix=(True, '_disc'))
    assert discretizer.getOptions() == {
        'selected': selection(),
        'suffix': (True, '_disc')
    }

    # Both discretized columns are expected to be ordinal
    shapeDict['cold_disc'] = Types.Ordinal
    shapeDict['cold2_disc'] = Types.Ordinal
    expectedShape = data.Shape.fromDict(shapeDict, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expectedShape

    result = discretizer.execute(frame)
    assert result.shape == expectedShape

    output = nan_to_None(result.to_dict())
    assert output == {
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': coldValues,
        'cold2': cold2Values,
        'cold_disc': ['70', '80', '80', None, 'now'],
        'cold2_disc': [None, '70', '80', None, None]
    }

    rawResult = result.getRawFrame()
    for discretizedName in ('cold_disc', 'cold2_disc'):
        assert rawResult[discretizedName].cat.categories.to_list() == ['50', '70', '80', 'now']
        assert rawResult[discretizedName].dtype.ordered is True
def test_discretize_set_options_exceptions():
    """DateDiscretizer.setOptions must reject invalid options with the matching error code."""
    rawData = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [pd.Timestamp('05-09-1988 13:45'), pd.Timestamp('22-12-1994 14:21'),
                 pd.Timestamp('21-11-1995 11:50'), None, pd.Timestamp('12-12-2012 09:15')]
    }
    frame = data.Frame(rawData)

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    shapeDict = frame.shape.columnsDict
    assert shapeDict['cold'] == Types.Datetime

    intervals = [pd.Timestamp('05-09-1988 07:00'), pd.Timestamp('20-12-1994 11:30'),
                 pd.Timestamp('05-09-2000 14:20'), pd.Timestamp('01-09-2010 14:30'),
                 pd.Timestamp('12-12-2012 09:14')]
    labels = ['early', 'middle', 'late', 'now']

    # (options for column 2, suffix option, expected code of the first reported error)
    invalidCases = [
        ({'ranges': (list(), True, True), 'labels': labels}, (False, '_disc'), 'bins'),
        ({'ranges': None, 'labels': labels}, (False, '_disc'), 'bins'),
        ({'ranges': (intervals, False, True), 'labels': labels[:1]}, (False, '_disc'), 'len'),
        ({'ranges': (intervals, False, True), 'labels': None}, (False, '_disc'), 'lab'),
        ({'ranges': (intervals, False, True), 'labels': ['a', 'b', 'c', 'a']},
         (False, '_disc'), 'unique'),
        ({'ranges': (intervals, False, True), 'labels': labels}, (True, ''), 'suff'),
    ]
    for columnOptions, suffixOption, errorCode in invalidCases:
        with pytest.raises(exp.OptionValidationError) as excInfo:
            discretizer.setOptions(selected={2: columnOptions}, suffix=suffixOption)
        # Checked outside the `with` block, otherwise the assertion would never run
        assert excInfo.value.invalid[0][0] == errorCode

    # No valid options were ever set, so there is still no output shape
    assert discretizer.getOutputShape() is None
def execute(self, df: data.Frame) -> data.Frame:
    """
    Rename columns according to the configured ``position -> new name`` mapping.

    :param df: input frame
    :return: a frame over a shallow copy of the data, carrying the new column names
    """
    renamed: List[str] = df.colnames
    for position in self.__names:
        renamed[position] = self.__names[position]
    # A shallow copy is enough: only the column labels change
    shallowCopy = df.getRawFrame().copy(deep=False)
    shallowCopy.columns = renamed
    return data.Frame(shallowCopy)
def execute(self) -> None:
    """
    Build a long-format frame of time series and store it in the workbench.

    For every configured series, the selected column of each source frame is tagged with its
    time label. The pieces are stacked and reindexed on every (id, time) combination. All
    series are then concatenated side by side, the 'time' level is moved from the index back
    to a column, and the result is saved under the configured output name.
    """

    def manipulateDf(s: pd.Series, lab: str, sName: str) -> pd.DataFrame:
        # Makes a dataframe with a value column named as the series and a time label column
        return s.to_frame(sName).assign(time=lab)

    allSeriesColumn: List[pd.DataFrame] = list()
    for seriesName, values in self.__series.items():
        values: List[Tuple[
            str, int,
            int]]  # [ (frameName, attrIndex, timeLabelIndex) ]
        # List of frames to append
        # NOTE(review): frames are read through `self.workbench` here, while the result is
        # written through `self._workbench` below; confirm both attributes exist
        frames: List[pd.DataFrame] = list(
            map(
                lambda tup: manipulateDf(
                    self.workbench.getDataframeModelByName(tup[0]).frame.
                    getRawFrame().iloc[:, tup[1]], self.__timeLabels[tup[
                        2]], seriesName), values))
        # Create a dataframe with two columns, one with the values of this series for every index
        # and 1 with the time label. A series column is indexed by Time and Index. In this way the
        # concatenation of all the series will be made correctly
        seriesColumn = pd.concat(frames, axis=0, join='outer')
        # Create a categorical ordinal index for time labels
        waves = pd.Index(seriesColumn['time'].unique(),
                         name='time',
                         dtype=pd.CategoricalDtype(
                             ordered=True, categories=self.__timeLabels))
        ids = seriesColumn.index.unique()
        # Set index to [id, time]
        seriesColumn = seriesColumn.set_index(['time'],
                                              drop=True,
                                              append=True)
        # Reindex to provide values for every possible combination of [id, time]
        multiIndex: pd.MultiIndex = pd.MultiIndex.from_product(
            [ids, waves])
        # Additionally sort indexes, otherwise concatenation drops index type
        seriesColumn = seriesColumn.reindex(multiIndex).sort_index(
            axis=0, ignore_index=False)
        allSeriesColumn.append(seriesColumn)
    # Concat all series in the same dataframe. Remove the 'time' column from index,
    # leaving only the original index (subject id)
    result = pd.concat(allSeriesColumn,
                       axis=1,
                       join='outer',
                       ignore_index=False).reset_index(level='time',
                                                       drop=False)
    # Result:
    # Index is set on the subject identifier
    # Column 'time' contains the names of the time axis (wave names or integers)
    # The other columns are named with the specified 'seriesName' and are the series values which
    # vary with time and index
    self._workbench.setDataframeByName(self.__outputName,
                                       data.Frame(result))
def execute(self) -> None:
    """
    Read the configured CSV file and store its content in the workbench.

    When a chunk size is configured, every chunk is stored as a separate frame named
    ``<name>_<chunk number>``; otherwise the whole file is stored under ``<name>``.

    :raise exp.InvalidOptions: if the options are not set
    """
    if not self.hasOptions():
        raise exp.InvalidOptions('Options are not set')

    loaded = pd.read_csv(self.__file,
                         sep=self.__separator,
                         index_col=False,
                         usecols=self.__selectedColumns,
                         chunksize=self.__splitByRowN)

    if self.__splitByRowN is None:
        # entire dataframe is read
        self._workbench.setDataframeByName(self.__wName, data.Frame(loaded))
        return

    # `loaded` is a chunk iterator
    for chunkNumber, chunk in enumerate(loaded):
        chunkName: str = self.__wName + '_{:d}'.format(chunkNumber)
        self._workbench.setDataframeByName(chunkName, data.Frame(chunk))
        # TOCHECK: this does not set a parent for the FrameModel (since workbench lives in
        #  different thread)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Discretize the selected columns into user-defined ranges with ``pandas.cut``.

    ``self.__attributes`` maps a column position to a pair ``(bin edges, labels)``. If an
    attribute suffix is set the result goes to a column named ``<column><suffix>`` (replacing
    any existing column with that name), otherwise it replaces the source column.

    :param df: input frame (left untouched, a deep copy is processed)
    :return: a new frame with the discretized columns
    """
    f = df.getRawFrame().copy(True)
    columns = f.columns.to_list()
    for c, o in self.__attributes.items():
        result = pd.cut(f.iloc[:, c], bins=o[0], labels=o[1], duplicates='drop')
        colName: str = columns[c]
        newColName: str = colName if not self.__attributeSuffix else colName + self.__attributeSuffix
        # Replace/add the whole column. `f.loc[:, newColName] = result` sets the values in place
        # on recent pandas, so an existing column would keep its old dtype, not the categorical
        f[newColName] = result
    return data.Frame(f)
def test_remove_column():
    """RemoveNanColumns with both the percentage and the absolute-number threshold."""
    removeOp = RemoveNanColumns()
    assert removeOp.hasOptions() is False

    # None is same as nan
    frame = data.Frame({
        'col1': [1, 2, 3, np.nan, 10],
        'col2': [3, 4, np.nan, np.nan, np.nan],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', None, '21-11-1995', '22-06-1994', '12-12-2012']
    }).setIndex('col1')

    assert removeOp.getOutputShape() is None
    # When both thresholds are given only the percentage is kept
    removeOp.setOptions(number=1, percentage=0.3)
    assert removeOp.getOptions() == (0.3, None)
    removeOp.addInputShape(frame.shape, 0)

    # 30% threshold: col2 is dropped
    cleaned = removeOp.execute(frame)
    assert cleaned != frame
    assert cleaned.shape != frame.shape
    assert cleaned.nRows == 5 == frame.nRows
    assert cleaned.shape.colNames == ['col3', 'date']
    assert cleaned.shape.colTypes == [Types.String, Types.String]

    # At most 3 nan allowed: nothing is removed
    removeOp.setOptions(percentage=None, number=3)
    cleaned = removeOp.execute(frame)
    assert cleaned == frame
    assert cleaned.nRows == 5
    assert cleaned.shape == frame.shape

    # Remove all cols with > 0 nan: date goes too, because of its None
    removeOp.setOptions(percentage=None, number=0)
    cleaned = removeOp.execute(frame)
    assert cleaned != frame
    assert cleaned.nRows == 5
    assert cleaned.to_dict() == {'col3': ['q', '2', 'c', '4', 'x']}
    assert cleaned.shape.colTypes == [Types.String]

    # 60% threshold: remove nothing
    removeOp.setOptions(percentage=0.6, number=0)
    cleaned = removeOp.execute(frame)
    assert cleaned == frame
    assert cleaned.nRows == 5
    assert cleaned.shape == frame.shape

    # 59% threshold: remove col2 only
    removeOp.setOptions(percentage=0.59, number=0)
    cleaned = removeOp.execute(frame)
    assert cleaned != frame
    assert cleaned.nRows == 5
    expectedShape = frame.shape.clone()
    col2Position = expectedShape.colNames.index('col2')
    del expectedShape.colTypes[col2Position]
    del expectedShape.colNames[col2Position]
    expectedShape.index = ['col1']
    assert cleaned.shape == expectedShape
def test_discretize_range_suffix():
    """RangeDiscretizer writing to a suffixed column whose name already exists in the frame."""
    frame = data.Frame({
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_binss': [3, 1, 'ww', '1', '1']
    })

    discretizer = RangeDiscretizer()
    tableOptions = {1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}}
    discretizer.setOptions(table=tableOptions, suffix=(True, '_binss'))
    # Changing the caller's dictionary afterwards must not affect the stored options
    tableOptions[1] = {}
    assert discretizer.getOptions() == {
        'table': {
            1: {
                'bins': [0, 2, 4, 7],
                'labels': 'A B C'
            }
        },
        'suffix': (True, '_binss')
    }

    discretizer.addInputShape(frame.shape, 0)
    expectedShape = frame.shape.clone()
    expectedShape.colTypes[1] = Types.Numeric
    expectedShape.colTypes[3] = Types.Ordinal
    assert discretizer.getOutputShape() == expectedShape

    withSuffix = discretizer.execute(frame)
    expected_output = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        # This column must replace the original duplicate
        'col2_binss': ['B', 'B', None, 'C', None]
    }
    assert nan_to_None(withSuffix.to_dict()) == expected_output
    assert withSuffix.shape == expectedShape

    # Check that output is the same as with drop
    discretizer.setOptions(table={1: {
        'bins': [0, 2, 4, 7],
        'labels': 'A B C'
    }}, suffix=(False, None))
    inPlace = nan_to_None(discretizer.execute(frame).to_dict())
    assert expected_output['col2_binss'] == inPlace['col2']
    assert expected_output['col2'] != inPlace['col2']
    assert expected_output['ww'] == inPlace['ww']
    assert expected_output['col1'] == inPlace['col1']
    assert expected_output['col2_binss'] != inPlace['col2_binss']
def test_discretize_by_date_and_time():
    """DateDiscretizer replacing two datetime columns, with intervals that include a time."""
    timestamps = ['05-09-1988 13:45', '22-12-1994 14:21', '21-11-1995 11:50', None,
                  '12-12-2012 09:15']
    rawData = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [pd.Timestamp(t) if t is not None else None for t in timestamps],
        'cold2': [pd.Timestamp(t) if t is not None else None for t in timestamps]
    }
    frame = data.Frame(rawData).setIndex(['col2', 'col3'])

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    shapeDict = frame.shape.columnsDict
    assert shapeDict['cold'] == Types.Datetime

    intervals = [pd.Timestamp('05-09-1988 07:00'), pd.Timestamp('20-12-1994 11:30'),
                 pd.Timestamp('05-09-2000 14:20'), pd.Timestamp('01-09-2010 14:30'),
                 pd.Timestamp('12-12-2012 09:14')]
    labels = ['early mo', 'middle', 'late', 'now']

    def selection():
        # A fresh dictionary on every call, so options and expectation never share state
        return {
            0: {'ranges': (intervals, True, True), 'labels': labels},
            1: {'ranges': (intervals, True, True), 'labels': labels}
        }

    discretizer.setOptions(selected=selection(), suffix=(False, '_disc'))
    # With the suffix disabled, the stored suffix string is None
    assert discretizer.getOptions() == {
        'selected': selection(),
        'suffix': (False, None)
    }

    shapeDict['cold'] = Types.Ordinal
    shapeDict['cold2'] = Types.Ordinal
    expectedShape = data.Shape.fromDict(shapeDict, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expectedShape

    result = discretizer.execute(frame)
    assert result.shape == expectedShape
    assert result.shape != frame.shape

    output = nan_to_None(result.to_dict())
    assert output == {
        'cold': ['early mo', 'middle', 'middle', None, None],
        'cold2': ['early mo', 'middle', 'middle', None, None]
    }

    rawResult = result.getRawFrame()
    for columnName in ('cold', 'cold2'):
        assert rawResult[columnName].cat.categories.to_list() == \
            ['early mo', 'middle', 'late', 'now']
        assert rawResult[columnName].dtype.ordered is True
def execute(self, df: data.Frame) -> data.Frame:
    """
    Convert the selected columns to strings, keeping missing values as NaN.

    :param df: input frame (left untouched, a deep copy is processed)
    :return: a new frame in which the selected columns hold strings
    :raise: whatever ``astype(str, errors='raise')`` raises if a column cannot be converted
    """
    # Deep copy
    raw_df = df.getRawFrame().copy(deep=True)
    selected = raw_df.iloc[:, self.__attributes]
    # Remember where the missing values are: astype(str) would turn them into text
    isNan = selected.isnull()
    # To string, then set to nan where values were nan
    processed = selected.astype(dtype=str, errors='raise').mask(isNan, np.nan)
    # Replace each column as a whole. `raw_df.iloc[:, ...] = processed` tries to write in place
    # on recent pandas, which can coerce the strings back into the original dtype.
    # Assumes unique column names, like the other operations that select columns by name
    for colName in processed.columns:
        raw_df[colName] = processed[colName]
    return data.Frame(raw_df)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Drop the columns that contain too many missing values.

    Exactly one threshold must be set: a column is kept when its fraction of NaN values is at
    most the percentage threshold, or when its count of NaN values is at most the number
    threshold.

    :param df: input frame (left untouched, a copy is processed)
    :return: a new frame containing only the columns that pass the threshold
    :raise exp.InvalidOptions: if both thresholds are set, or if neither is
    """
    if self.__thresholdPercentage is not None and self.__thresholdNumber is not None:
        raise exp.InvalidOptions('Can\'t have both threshold set')
    if self.__thresholdPercentage is None and self.__thresholdNumber is None:
        # Fail explicitly, not inside pandas while comparing against None
        raise exp.InvalidOptions('No threshold set')

    pf = df.getRawFrame().copy()
    nanMask = pf.isnull()
    # Test against None explicitly: a percentage threshold of 0.0 is falsy, and a truthiness
    # check would wrongly fall through to the (unset) number threshold
    if self.__thresholdPercentage is not None:
        # By percentage
        keep = nanMask.mean() <= self.__thresholdPercentage
    else:
        # By nan number
        keep = nanMask.sum() <= self.__thresholdNumber
    return data.Frame(pf.loc[:, keep])
def test_minMaxScale():
    """MinMaxScaler with a different feature range for each of the two scaled columns."""
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = MinMaxScaler()
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: {'range': (-1, 1)}, 1: {'range': (2, 4)}})
    # Options alone are not enough: an input shape is required
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)

    def rescaled(values, low, high):
        # Reference min-max scaling. nanmin/nanmax are used because the built-in min/max give
        # results that depend on where the NaN sits in the list
        smallest = float(np.nanmin(values))
        largest = float(np.nanmax(values))
        return [((x - smallest) / (largest - smallest)) * (high - low) + low for x in values]

    expected = {
        'col1': rescaled(d['col1'], -1, 1),
        'col2': rescaled(d['col2'], 2, 4),
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(), 4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    # Scaling must actually change the values
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    options = op.getOptions()
    assert options == {
        'attributes': {
            0: {
                'range': (-1, 1)
            },
            1: {
                'range': (2, 4)
            }
        }
    }
def test_join_on_cols():
    """Right join (merge) of two frames on one column from each side."""
    leftData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    rightData = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 7, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
                          dtype='datetime64[ns]')
    }
    left = data.Frame(leftData).setIndex('col1')
    right = data.Frame(rightData).setIndex('col2')

    joinOp = Join()
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Not on index: left column at position 2, right column at position 1
    mergeOpts = ('_l', '_r', False, 2, 1, jt.Right)
    joinOp.setOptions(*mergeOpts)
    assert joinOp.getOptions() == mergeOpts

    columnTypes = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_l': Types.String,
        'col3_r': Types.String,
        'date_l': Types.String,
        'date_r': Types.Datetime
    }
    # Note that merge does not preserve index
    indexTypes = {
        'Unnamed': IndexType(Types.Numeric)  # Default index
    }
    expectedShape = data.Shape.fromDict(columnTypes, indexTypes)
    assert joinOp.getOutputShape() == expectedShape

    merged = joinOp.execute(left, right)
    assert merged.shape == expectedShape
def test_merge_numeric():
    """ReplaceValues on numeric columns, including how the stored options are normalised."""
    dateStrings = ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': list(dateStrings)
    })

    replaceOp = ReplaceValues()
    # Default options
    assert replaceOp.getOptions() == {'table': dict(), 'inverted': False}
    replaceOp.addInputShape(frame.shape, 0)

    requestedTable = {
        1: {
            'values': '1.0 3.0 4.0; 6 0.0',
            'replace': '-1.0;-2.0'
        },
        0: {
            'values': '1.0 4.0',
            'replace': '7.0'
        }
    }
    replaceOp.setOptions(table=requestedTable, inverted=False)

    # Stored options are re-formatted: '6' is reported as '6.0', separators as '; '
    storedOptions = replaceOp.getOptions()
    assert storedOptions == {
        'table': {
            1: {
                'values': '1.0 3.0 4.0; 6.0 0.0',
                'replace': '-1.0; -2.0'
            },
            0: {
                'values': '1.0 4.0',
                'replace': '7.0'
            }
        },
        'inverted': False
    }
    assert isDictDeepCopy(requestedTable, storedOptions['table'])

    expectedShape = frame.shape.clone()
    assert replaceOp.getOutputShape() == expectedShape

    replaced = replaceOp.execute(frame)
    assert replaced != frame
    assert replaced.shape == expectedShape
    assert replaced.to_dict() == {
        'col1': [7.0, 2.0, 3.0, 7.0, 10.0],
        'col2': [-1.0, -1.0, 5.0, -2.0, -2.0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dateStrings
    }