def test_replace_input_formats_listlike(self):
    """Exercise ``DataFrame.replace`` with dict/dict, list/list and scalar/dict inputs."""
    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.inf],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(to_rep, values)
    # Build the expectation column by column with Series.replace.
    expected = {}
    for label, column in df.items():
        expected[label] = column.replace(to_rep[label], values[label])
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({'A': [np.nan, 5, np.inf],
                          'B': [5, 2, 0],
                          'C': ['', 'asdf', 'fd']})
    assert_frame_equal(result, expected)

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.nan],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(np.nan, values)
    expected = {}
    for label, column in df.items():
        expected[label] = column.replace(np.nan, values[label])
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    # Apply the replacements one pair at a time, in list order.
    for old, new in zip(to_rep, values):
        expected.replace(old, new, inplace=True)
    assert_frame_equal(result, expected)

    # Lists of different length must be rejected.
    msg = r"Replacement lists must match in length\. Expecting 3 got 2"
    with pytest.raises(ValueError, match=msg):
        df.replace(to_rep, values[1:])
def int_frame():
    """
    Fixture for DataFrame of ints with index of unique strings

    Columns are ['A', 'B', 'C', 'D']

                A  B  C  D
    vpBeWjM651  1  0  1  0
    5JyxmrP1En -1  0  0  0
    qEDaoD49U2 -1  1  0  0
    m66TkTfsFe  0  0  0  0
    EHPaNzEUFm -1  0 -1  0
    fpRJCevQhi  2  0  0  0
    OlQvnmfi3Q  0  0 -2  0
    ...        .. .. .. ..
    uB1FPlz4uP  0  0  0  1
    EcSe6yNzCU  0  0 -1  0
    L50VudaiI8 -1  1 -2  0
    y3bpw4nwIp  0 -1  0  0
    H0RdLLwrCT  1  1  0  0
    rY82K0vMwm  0  0  0  0
    1OPIUjnkjk  2  0  0  0

    [30 rows x 4 columns]
    """
    # Cast every series returned by the testing helper to the platform int.
    as_ints = {}
    for label, series in tm.getSeriesData().items():
        as_ints[label] = series.astype(int)
    df = DataFrame(as_ints)
    # force these all to int64 to avoid platform testing issues
    return DataFrame(dict(df.items()), dtype=np.int64)
def _features_to_properties(features: pd.DataFrame) -> Dict[str, np.ndarray]: """Converts a features DataFrame to a deprecated properties dictionary. See Also -------- :meth:`_FeatureTable.properties` """ return {name: series.to_numpy() for name, series in features.items()}
def test_items(self): # GH 17213, GH 13918 cols = ["a", "b", "c"] df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) for c, (k, v) in zip(cols, df.items()): assert c == k assert isinstance(v, Series) assert (df[k] == v).all()
def test_items(self): # GH 17213, GH 13918 cols = ['a', 'b', 'c'] df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) for c, (k, v) in zip(cols, df.items()): assert c == k assert isinstance(v, Series) assert (df[k] == v).all()
def str_to_cats(dataframe: pd.DataFrame):
    """
    Convert string columns to ordered categoricals, in place.

    Every column for which ``is_string_dtype`` is true is replaced by its
    ``category``-typed, ordered equivalent. Other columns are untouched.

    :param dataframe: frame to modify
    :return: the same ``dataframe`` object
    """
    for col_name, col in dataframe.items():
        if not is_string_dtype(col):
            continue
        as_category = col.astype('category')
        dataframe[col_name] = as_category.cat.as_ordered()
    return dataframe
def test_items(self): # issue #17213, #13918 cols = ['a', 'b', 'c'] df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) for c, (k, v) in zip(cols, df.items()): assert c == k assert isinstance(v, Series) assert (df[k] == v).all()
def _test(model_split: dict, test_data: DataFrame, evaluation: bool,
          is_binary_classifier: bool) -> (DataFrame, float):
    """Run the per-cluster classifiers on the test data.

    The clusterer stored in ``model_split`` splits ``test_data`` into one
    frame per cluster. For every non-empty cluster the matching classifier
    predicts a ``predicted`` column, and, when ``evaluation`` is true, a score
    is computed and its AUC (via ``get_auc``) is accumulated.

    :param model_split: mapping holding the clusterer and the per-cluster
        classifiers under ``ModelType.CLUSTERER.value`` /
        ``ModelType.CLASSIFIER.value``
    :param test_data: data to cluster and classify; must contain ``label``
    :param evaluation: whether to compute scores and accumulate the AUC
    :param is_binary_classifier: whether the task is binary classification
    :return: ``(results_df, auc)``. ``results_df`` is the concatenation of all
        non-empty cluster frames with the added ``predicted`` column. ``auc``
        is averaged over the non-empty clusters for binary problems and is the
        raw accumulated sum otherwise.
    """
    # Local import keeps the block self-contained; only concat is needed.
    from pandas import concat

    clusterer = model_split[ModelType.CLUSTERER.value]
    classifier = model_split[ModelType.CLASSIFIER.value]

    test_data = clusterer.cluster_data(test_data)

    cluster_frames = []
    auc = 0
    non_empty_clusters = clusterer.n_clusters

    for cluster in range(clusterer.n_clusters):
        cluster_test_df = test_data[cluster]
        if cluster_test_df.empty:
            non_empty_clusters -= 1
            continue

        model = classifier[cluster]
        cluster_targets_df = cluster_test_df['label']
        # `columns=` replaces the positional axis argument, which pandas >= 2.0
        # no longer accepts.
        features = cluster_test_df.drop(columns=['label'])

        if evaluation:
            try:
                if hasattr(model, 'decision_function'):
                    scores = model.decision_function(features)
                else:
                    scores = model.predict_proba(features)
                    if np.size(scores, 1) >= 2:  # checks number of columns
                        scores = scores[:, 1]
            except (NotImplementedError, KeyError):
                # Some models cannot consume a DataFrame: retry on the raw array.
                if hasattr(model, 'decision_function'):
                    scores = model.decision_function(features.values)
                else:
                    scores = model.predict_proba(features.values)
                    try:
                        if np.size(scores, 1) >= 2:  # checks number of columns
                            scores = scores[:, 1]
                    except (IndexError, TypeError):
                        # Scores without a second axis (or not 2-D indexable)
                        # are used as returned.
                        pass
            auc += get_auc(cluster_targets_df, scores)

        try:
            predicted = model.predict(features)
        except (NotImplementedError, KeyError):
            predicted = model.predict(features.values)
        cluster_test_df['predicted'] = predicted

        cluster_frames.append(cluster_test_df)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    results_df = concat(cluster_frames) if cluster_frames else DataFrame()

    # Average only when at least one cluster contributed; this avoids a
    # ZeroDivisionError when every cluster is empty.
    if non_empty_clusters > 0 and (
            is_binary_classifier
            or max(len(set(t['label'])) for _, t in test_data.items()) <= 2):
        auc = float(auc) / non_empty_clusters
    # TODO: check if AUC is ok for multiclass, otherwise implement
    return results_df, auc
def _sample_dfs(t_df: pyspark.sql.DataFrame, t_fracs: pd.DataFrame, c_can_df: pyspark.sql.DataFrame, c_fracs: pd.DataFrame, match_col: str) -> Tuple[DataFrame, DataFrame]:
    r"""Sample the treatment and control populations with per-stratum fractions.

    Parameters
    ----------
    t_df : pyspark.sql.DataFrame
        treatment population
    t_fracs : pd.DataFrame
        with columns `match_col` and `treatment_scaled_sample_fraction`
    c_can_df : pyspark.sql.DataFrame
        control candidate population
    c_fracs : pd.DataFrame
        with columns `match_col` and `control_scaled_sample_fraction`
    match_col : str
        column holding the stratum key; keys are converted with ``int``

    Returns
    -------
    t_out : pyspark.sql.DataFrame
        ``t_df.sampleBy`` result; treatment fractions are capped at 1
    c_out : pyspark.sql.DataFrame
        ``c_can_df.sampleBy`` result; control fractions are used as given
    """
    _persist_if_unpersisted(t_df)
    _persist_if_unpersisted(c_can_df)

    # Treatment: stratum -> fraction, capped at 1.
    t_by_stratum = t_fracs.set_index(match_col).treatment_scaled_sample_fraction.to_dict()
    t_dict = {int(key): min(float(value), 1) for key, value in t_by_stratum.items()}
    t_out = t_df.sampleBy(col=match_col, fractions=t_dict, seed=42)

    # Control: stratum -> fraction, not capped.
    c_by_stratum = c_fracs.set_index(match_col).control_scaled_sample_fraction.to_dict()
    c_dict = {int(key): float(value) for key, value in c_by_stratum.items()}
    c_out = c_can_df.sampleBy(col=match_col, fractions=c_dict, seed=42)

    return t_out, c_out
def make_mem_efficient_gene_arrays(calls: pd.DataFrame):
    """Pair every column of ``calls`` with every later column.

    Returns four parallel tuples ``(namesA, namesB, valuesA, valuesB)``:
    position ``i`` holds the labels and the column Series of the ``i``-th
    2-combination of columns, in ``itertools.combinations`` order.

    Raises ``ValueError`` when ``calls`` has fewer than two columns, because
    there is no pair to unpack.
    """
    columns = list(calls.items())
    names = [label for label, _ in columns]
    values = [series for _, series in columns]
    namesA, namesB = zip(*itertools.combinations(names, 2))
    valuesA, valuesB = zip(*itertools.combinations(values, 2))
    return namesA, namesB, valuesA, valuesB
def int_frame():
    """
    Fixture for DataFrame of ints with index of unique strings

    Columns are ['A', 'B', 'C', 'D']
    """
    # Cast each series from the testing helper to int first.
    int_series = {}
    for name, series in tm.getSeriesData().items():
        int_series[name] = series.astype(int)
    df = DataFrame(int_series)
    # force these all to int64 to avoid platform testing issues
    return DataFrame(dict(df.items()), dtype=np.int64)
def get_dtypes_and_schemas_of_dataframe(dataframe: pd.DataFrame):
    """Collect the dtype and schema type hint of every column.

    Each column is passed to ``get_dtype_and_schema_of_array``, which is
    expected to return a ``(dtype, schema_type_hint)`` pair.

    :return: two dicts keyed by column name: ``(dtypes, schema_type_hints)``
    """
    dtypes_by_column_name = {}
    schema_type_hints_by_column_name = {}
    for column_name, column_values in dataframe.items():
        dtype, schema_type_hint = get_dtype_and_schema_of_array(column_values)
        dtypes_by_column_name[column_name] = dtype
        schema_type_hints_by_column_name[column_name] = schema_type_hint
    return dtypes_by_column_name, schema_type_hints_by_column_name
def plot_neff_vs_width(df: pd.DataFrame, **kwargs):
    """Plot every non-``width`` column of ``df`` against ``df.width``.

    Each column is drawn with the ``".-"`` style and labelled with its column
    name. ``kwargs`` are forwarded to ``plt.legend``.
    """
    width = df.width
    for mode_number, neff in df.items():
        if mode_number == "width":
            # The x axis itself is not a curve.
            continue
        plt.plot(width, neff, ".-", label=str(mode_number))
    plt.legend(**kwargs)
    plt.xlabel("width (um)")
    plt.ylabel("neff")
def _unravel_gradient_covariance_matrix(cls, parameter_name, covariance_matrix: pd.DataFrame): return { cls._gradient_covariance_name.format(parameter_name=parameter_name, gate_name_1=gate_1, gate_name_2=gate_2): cov_entry for gate_1, cov_column in covariance_matrix.items() for gate_2, cov_entry in cov_column.items() }
def perf(data: pd.DataFrame) -> pd.DataFrame:
    """
    Performance rebased to 100

    Each column is handled on its own: missing values are dropped, percentage
    changes are compounded, and the result is scaled so that the first
    remaining observation equals 100. The rebased columns are then joined
    side by side.
    """
    rebased = []
    for _, srs in data.items():
        growth_factors = srs.dropna().pct_change().fillna(0).add(1)
        rebased.append(growth_factors.cumprod().mul(100))
    joined = pd.concat(rebased, axis=1)
    return pd.DataFrame(joined)
def add_feature(src: pd.DataFrame, target: dict, prefix: str = None) -> None:
    """Copy every ``(label, column)`` item of ``src`` into ``target``.

    Keys are the stringified labels, preceded by ``prefix + "_"`` when a
    prefix is given. ``target`` is modified in place and nothing is returned.
    """
    key_prefix = "" if prefix is None else prefix + "_"
    for label, column in src.items():
        target[str(key_prefix) + str(label)] = column
def test_replace_input_formats_listlike(self):
    """Exercise ``DataFrame.replace`` with dict/dict, list/list and scalar/dict inputs."""
    # both dicts
    to_rep = {"A": np.nan, "B": 0, "C": ""}
    values = {"A": 0, "B": -1, "C": "missing"}
    df = DataFrame(
        {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
    )
    filled = df.replace(to_rep, values)
    # Expected frame is assembled from per-column Series.replace calls.
    expected = {}
    for name, series in df.items():
        expected[name] = series.replace(to_rep[name], values[name])
    tm.assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame(
        {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]}
    )
    tm.assert_frame_equal(result, expected)

    # scalar to dict
    values = {"A": 0, "B": -1, "C": "missing"}
    df = DataFrame(
        {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
    )
    filled = df.replace(np.nan, values)
    expected = {}
    for name, series in df.items():
        expected[name] = series.replace(np.nan, values[name])
    tm.assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, ""]
    values = [-2, -1, "missing"]
    result = df.replace(to_rep, values)
    expected = df.copy()
    # One in-place replacement per (target, replacement) pair, in order.
    for target, replacement in zip(to_rep, values):
        expected.replace(target, replacement, inplace=True)
    tm.assert_frame_equal(result, expected)

    # Mismatched list lengths must raise.
    msg = r"Replacement lists must match in length\. Expecting 3 got 2"
    with pytest.raises(ValueError, match=msg):
        df.replace(to_rep, values[1:])
def test_replace_input_formats_listlike(self):
    """Exercise ``DataFrame.replace`` with dict/dict, list/list and scalar/dict inputs."""
    column_b = [0, 2, 5]
    column_c = ['', 'asdf', 'fd']

    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({
        'A': [np.nan, 0, np.inf],
        'B': list(column_b),
        'C': list(column_c),
    })
    filled = df.replace(to_rep, values)
    expected = {}
    for key, col in df.items():
        expected[key] = col.replace(to_rep[key], values[key])
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({
        'A': [np.nan, 5, np.inf],
        'B': [5, 2, 0],
        'C': list(column_c),
    })
    assert_frame_equal(result, expected)

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({
        'A': [np.nan, 0, np.nan],
        'B': list(column_b),
        'C': list(column_c),
    })
    filled = df.replace(np.nan, values)
    expected = {}
    for key, col in df.items():
        expected[key] = col.replace(np.nan, values[key])
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    # Replacements are applied sequentially, pair by pair.
    for before, after in zip(to_rep, values):
        expected.replace(before, after, inplace=True)
    assert_frame_equal(result, expected)

    # A shorter replacement list is an error.
    msg = r"Replacement lists must match in length\. Expecting 3 got 2"
    with pytest.raises(ValueError, match=msg):
        df.replace(to_rep, values[1:])
def test_apply_differently_indexed():
    """``DataFrame.apply(Series.describe)`` matches per-column / per-row describe."""
    df = DataFrame(np.random.randn(20, 10))

    # axis=0: describe each column.
    result = df.apply(Series.describe, axis=0)
    per_column = {}
    for label, column in df.items():
        per_column[label] = column.describe()
    expected = DataFrame(per_column, columns=df.columns)
    tm.assert_frame_equal(result, expected)

    # axis=1: describe each row; rows are reached through the transpose.
    result = df.apply(Series.describe, axis=1)
    per_row = {}
    for label, row in df.T.items():
        per_row[label] = row.describe()
    expected = DataFrame(per_row, columns=df.index).T
    tm.assert_frame_equal(result, expected)
def __createSeriesForAttributes(dataframe: pd.DataFrame, timeIndex: int, timeIndexType: Type) \
        -> Tuple[List[QtCharts.QLineSeries], float, float]:
    """
    Creates a QLineSeries for every column in the dataframe. 'timeIndex' column is used for xAxis

    The time column is converted to floats (milliseconds for ``Types.Datetime``,
    category codes otherwise) without writing the converted values back into
    ``dataframe``, so the caller's frame is left untouched.

    :param dataframe: data to plot; one series is created per non-time column
    :param timeIndex: position of the time column in ``dataframe.columns``
    :param timeIndexType: ``Types.Datetime`` for a datetime axis; any other
        value is treated as ordinal (categorical) time
    :return: tuple as (list of series, yMin, yMax); yMin/yMax are ``None``
        when there is no value column
    """
    timeIndexName: str = dataframe.columns[timeIndex]
    timeColumn: pd.Series = dataframe[timeIndexName]

    # Convert time values to their numerical equivalent
    if timeIndexType == Types.Datetime:
        # Time axis is Datetime, so convert every date into the number of ms from 01/01/1970
        # (to_numeric yields the integer timestamp, assumed to be in ns). This may not be
        # super accurate.
        rawTime = pd.to_numeric(timeColumn, downcast='integer', errors='coerce').values / (10 ** 6)
    else:
        # Types.Ordinal: the column is categorical, use the category codes
        rawTime = timeColumn.cat.codes.to_list()
    # Keep the converted values in a separate Series instead of assigning them
    # back with .loc, which would mutate the argument and depends on whether
    # the existing column dtype can hold the new values.
    timeValues: pd.Series = pd.Series(rawTime, index=dataframe.index).astype(float)

    # Remove time column since we already used it to create the time points
    dataframe = dataframe.drop(timeIndexName, axis=1)

    # Create series for every column (excluding time)
    allSeries: List[QtCharts.QLineSeries] = list()
    # Also keep track of the range the y axis should have
    yMin: float = None
    yMax: float = None
    for colName, valueSeries in dataframe.items():
        valueSeries = pd.Series(valueSeries)
        # pd.api.types.is_categorical was removed in pandas 2.0; test the dtype directly.
        if isinstance(valueSeries.dtype, pd.CategoricalDtype):
            # makes sure this is a series of floats
            valueSeries = valueSeries.cat.codes.astype(float)
        # Compute minimum and maximum of series and update global range
        smin = valueSeries.min()
        smax = valueSeries.max()
        yMin = smin if (yMin is None or yMin > smin) else yMin
        yMax = smax if (yMax is None or yMax < smax) else yMax
        # Create series
        qSeries = QtCharts.QLineSeries()
        points: List[QPointF] = [QPointF(x, y) for x, y in zip(timeValues, valueSeries)]
        qSeries.append(points)
        qSeries.setName(colName)
        qSeries.setUseOpenGL(True)
        qSeries.setPointsVisible(True)  # This is ignored with OpenGL enabled
        allSeries.append(qSeries)
    return allSeries, yMin, yMax
def validate_predictions(predictions: pd.DataFrame):
    """Check that a predictions frame has the expected layout and contents.

    The last two columns must be named ``predictions`` and ``truth``. With
    exactly two columns the frame is treated as regression output and both
    columns must be numeric. With more columns it is treated as classification
    output: the leading columns hold one probability per predictor label.

    For classification the function checks that the predictor columns are
    sorted and unique, that probabilities are numeric, that predicted values
    are known predictors, and that each row's prediction is the predictor with
    the highest probability. Mismatches between the truth values and the
    predictors are only logged as warnings.

    :param predictions: frame to validate
    :raises AssertionError: when a structural check fails
    :raises ValueError: from ``pd.to_numeric`` on non-numeric values
    """
    names = predictions.columns.values
    assert len(names) >= 2, "predictions frame should have 2 columns (regression) or more (classification)"
    assert names[-1] == "truth", "last column of predictions frame must be named `truth`"
    # names[-2] is the second-to-last column; the message used to say "last".
    assert names[-2] == "predictions", "second-to-last column of predictions frame must be named `predictions`"
    if len(names) == 2:  # regression
        for _, col in predictions.items():
            pd.to_numeric(col)  # pandas will raise if we have non-numerical values
    else:  # classification
        predictors = names[:-2]
        probabilities = predictions.iloc[:, :-2]
        preds = predictions.iloc[:, -2]
        truth = predictions.iloc[:, -1]

        assert np.array_equal(predictors, np.sort(predictors)), \
            "Predictors columns are not sorted in lexicographic order."
        # Comparing sets can never reveal duplicates; compare counts instead.
        assert len(np.unique(predictors)) == len(predictors), \
            "Predictions contain multiple columns with the same label."
        for _, col in probabilities.items():
            pd.to_numeric(col)  # pandas will raise if we have non-numerical values

        # Rows are indexed positionally with .iloc so the check does not rely
        # on the deprecated integer fallback of Series.__getitem__.
        if _encode_predictions_and_truth_:
            assert np.array_equal(truth, truth.astype(int)), "Values in truth column are not encoded."
            assert np.array_equal(preds, preds.astype(int)), "Values in predictions column are not encoded."
            predictors_set = set(range(len(predictors)))

            def validate_row(r):
                return r.iloc[:-2].astype(float).values.argmax() == r.iloc[-2]
        else:
            predictors_set = set(predictors)

            def validate_row(r):
                return r.iloc[:-2].astype(float).idxmax() == r.iloc[-2]

        truth_set = set(truth.unique())
        if predictors_set < truth_set:
            log.warning("Truth column contains values unseen during training: no matching probability column.")
        if predictors_set > truth_set:
            log.warning("Truth column doesn't contain all the possible target values: the test dataset may be too small.")
        predictions_set = set(preds.unique())
        assert predictions_set <= predictors_set, \
            "Predictions column contains unexpected values: {}.".format(predictions_set - predictors_set)
        assert predictions.apply(validate_row, axis=1).all(), \
            "Predictions don't always match the predictor with the highest probability."
def jCurrencyConversionChart(fromCurrency, toCurrencies, amount):
    """Convert ``amount`` from one currency into several and tabulate the results.

    Currency codes are upper-cased and ``amount`` is converted to ``float``
    before ``jConvertCurrency`` is called once per target currency.

    :return: DataFrame with columns ``from_currency``, ``to_currency``, ``amount``
    """
    fromCurrency = fromCurrency.upper()
    amount = float(amount)
    toCurrencies = [code.upper() for code in toCurrencies]
    # Keyed by target currency, so a repeated code keeps a single row.
    converted = {
        toCurrency: jConvertCurrency(fromCurrency, toCurrency, amount)
        for toCurrency in toCurrencies
    }
    chart = DataFrame(converted.items(), columns=['to_currency', 'amount'])
    chart['from_currency'] = fromCurrency
    return chart[['from_currency', 'to_currency', 'amount']]
def compute_matrix_from_columns(
    input_df: DataFrame, callback: Callable, remove_na: bool = True
) -> DataFrame:
    """Apply ``callback`` to every ordered pair of columns of ``input_df``.

    :param input_df: frame whose columns are compared pairwise
    :param callback: called as ``callback(series_1, series_2)``
    :param remove_na: when true, rows where either series is missing are
        dropped from both series before the call
    :return: square DataFrame labelled by the column names on both axes. The
        result for ``(column_1, column_2)`` is stored in column ``column_1``,
        row ``column_2``.
    """
    labels = input_df.columns
    output_data = OrderedDict.fromkeys(labels)
    for feature1_name, feature_1_series in input_df.items():
        per_feature = OrderedDict.fromkeys(labels)
        output_data[feature1_name] = per_feature
        for feature2_name, feature_2_series in input_df.items():
            if remove_na:
                # Keep only rows present in both series.
                mask = ~(feature_1_series.isna() | feature_2_series.isna())
            else:
                mask = [True] * len(feature_1_series)
            per_feature[feature2_name] = callback(
                feature_1_series[mask], feature_2_series[mask]
            )
    return DataFrame(output_data, columns=labels, index=labels)
def kstest_dataframe(ror: pd.DataFrame, distr: str = 'norm') -> pd.DataFrame:
    """
    Kolmogorov-Smirnov test for goodness of fit test on time series in form of Pandas DataFrame.

    ``Frame.kstest_series`` is applied to every column with the given ``distr``.

    Returns:
        DataFrame with one column per input column, holding that column's
        test result (the test statistic and the p-value for the hypothesis test)
    """
    test_dict = {
        label: Frame.kstest_series(content, distr=distr)
        for label, content in ror.items()
    }
    return pd.DataFrame.from_dict(test_dict, orient='columns')
def save_pandas_to_lmdb(df: pd.DataFrame, lmdb_path: str, max_size: int = None):
    """Store every row of ``df`` in an LMDB database.

    Rows are keyed by their position (``"0"``, ``"1"``, ...) encoded as bytes;
    each value is the row's ``{column: value}`` dict packed with msgpack.

    :param df: frame to store
    :param lmdb_path: path passed to ``lmdb.open``
    :param max_size: LMDB map size in bytes; defaults to 1.5 times the deep
        memory usage of ``df``
    """
    if max_size is None:
        max_size = df.memory_usage(deep=True).sum() * 1.5
    max_size = int(max_size)

    # reset_index returns a new frame. The result used to be discarded, so the
    # positional re-indexing never took effect and non-unique indexes made
    # to_dict(orient='index') fail.
    rows = df.reset_index(drop=True).to_dict(orient='index')

    env = lmdb.open(lmdb_path, map_size=max_size)
    try:
        with env.begin(write=True) as txn:
            for k, v in tqdm(rows.items()):
                txn.put(str(k).encode(), msgpack.packb(v))
    finally:
        # Release the environment handle even if a write fails.
        env.close()
def tranform_columns_to_categorical(
        df: pd.DataFrame, ordered: Dict = dict()) -> pd.DataFrame:
    """Return a copy of ``df`` whose string columns are ordered categoricals.

    :param df: input frame; it is copied, not modified
    :param ordered: optional mapping ``column name -> category order``; a
        converted column listed here gets exactly these categories, in this
        order
    :return: the converted copy
    """
    df = df.copy()
    for n, c in df.items():
        if not is_string_dtype(c):
            continue
        df[n] = c.astype('category').cat.as_ordered()
        if n in ordered:
            # Impose the caller-supplied category order.
            df[n] = df[n].cat.set_categories(ordered[n], ordered=True)
    return df
def cats_to_codes(dataframe: pd.DataFrame, max_n_cats: int = None):
    """
    Convert non-numeric columns to integer category codes, in place.

    :param dataframe: frame to modify
    :param max_n_cats: when given, a column is converted only if it has MORE
        than ``max_n_cats`` categories (the column must then be categorical);
        when ``None`` every non-numeric column is converted
    :return: the same ``dataframe`` object
    """
    for col_name, col in dataframe.items():
        if is_numeric_dtype(col):
            continue
        if max_n_cats is not None and len(col.cat.categories) <= max_n_cats:
            continue
        # nulls are -1 so with +1 they are 0
        dataframe[col_name] = pd.Categorical(col).codes + 1
    return dataframe
def to_dict(x: pd.DataFrame, drop_na: bool = True) -> Union[list, dict]:
    """
    Convert a pandas DataFrame to a row-wise list of dictionaries

    A plain ``dict`` is treated as a single record and returned as a dict; a
    list of dicts is processed record by record.

    :param x: DataFrame, dict, or list of dicts
    :param drop_na: drop entries whose value is missing (``pd.isna``); these
        do not need to be sent each time
    :return: list of records for a DataFrame/list input, dict for a dict input
    """
    def _without_na(record: dict) -> dict:
        return {k: v for k, v in record.items() if not pd.isna(v)}

    if isinstance(x, dict):
        # A single record: previously this fell through to the list handling
        # and failed while iterating over the dict's keys.
        return _without_na(x) if drop_na else x

    if isinstance(x, pd.DataFrame):
        x = x.to_dict(orient='records')

    if drop_na:
        x = [_without_na(record) for record in x]
    return x
def test_from_records_dictlike(self): # test the dict methods df = DataFrame( { "A": np.array(np.random.randn(6), dtype=np.float64), "A1": np.array(np.random.randn(6), dtype=np.float64), "B": np.array(np.arange(6), dtype=np.int64), "C": ["foo"] * 6, "D": np.array([True, False] * 3, dtype=bool), "E": np.array(np.random.randn(6), dtype=np.float32), "E1": np.array(np.random.randn(6), dtype=np.float32), "F": np.array(np.arange(6), dtype=np.int32), } ) # columns is in a different order here than the actual items iterated # from the dict blocks = df._to_dict_of_blocks() columns = [] for b in blocks.values(): columns.extend(b.columns) asdict = {x: y for x, y in df.items()} asdict2 = {x: y.values for x, y in df.items()} # dict of series & dict of ndarrays (have dtype info) results = [] results.append(DataFrame.from_records(asdict).reindex(columns=df.columns)) results.append( DataFrame.from_records(asdict, columns=columns).reindex(columns=df.columns) ) results.append( DataFrame.from_records(asdict2, columns=columns).reindex(columns=df.columns) ) for r in results: tm.assert_frame_equal(r, df)
def apply_cats(dataframe: pd.DataFrame, cats_dataframe: pd.DataFrame):
    """
    Apply the categories in cats_dataframe to dataframe, in place.

    Every column of ``dataframe`` that also exists in ``cats_dataframe`` with
    a ``category`` dtype is converted to an ordered categorical using exactly
    the reference column's categories. Values that are not among those
    categories become missing.

    :param dataframe: frame to modify
    :param cats_dataframe: frame providing the reference categories
    :return: the same ``dataframe`` object
    """
    for col_name, col in dataframe.items():
        if col_name not in cats_dataframe.columns:
            continue
        reference = cats_dataframe[col_name]
        if reference.dtype.name != 'category':
            continue
        # Convert first, then set categories on the converted column: the old
        # code called `.cat` on the unconverted column and read the
        # categories through a misspelled `.car` accessor.
        dataframe[col_name] = col.astype('category').cat.set_categories(
            reference.cat.categories, ordered=True)
    return dataframe
def test_apply_differently_indexed(self): df = DataFrame(np.random.randn(20, 10)) result0 = df.apply(Series.describe, axis=0) expected0 = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns) assert_frame_equal(result0, expected0) result1 = df.apply(Series.describe, axis=1) expected1 = DataFrame({i: v.describe() for i, v in df.T.items()}, columns=df.index).T assert_frame_equal(result1, expected1)
def infer_schema_from_df(
    df: pd.DataFrame,
    features,
    entities,
    timestamp_key: str = None,
    entity_columns=None,
    options: InferOptions = InferOptions.Null,
):
    """infer feature set schema from dataframe

    Column types come from ``_get_column_type``. Columns that are entities
    (listed in ``entity_columns`` or already present in ``entities``) update
    or create an ``Entity``; other columns update or create a ``Feature`` when
    the ``Features`` option is set, except for the ``timestamp_key`` column.
    When the ``Index`` option is set, index fields are registered as entities.

    :param df: dataframe to inspect
    :param features: mapping of feature name to feature object, updated in place
    :param entities: mapping of entity name to entity object, updated in place
    :param timestamp_key: known timestamp column name, if any
    :param entity_columns: column names to treat as entities
    :param options: ``InferOptions`` flags selecting what to infer
    :return: ``timestamp_key`` if given; otherwise the single detected datetime
        field when exactly one was found; otherwise ``timestamp_key`` unchanged
    """
    timestamp_fields = []
    current_entities = list(entities.keys())
    entity_columns = entity_columns or []

    def upsert_entity(name, value_type):
        if name in current_entities:
            entities[name].value_type = value_type
        else:
            # Use the `name` argument. The enclosing loop variable `column`
            # is wrong (or undefined) when this is called for index fields.
            entities[name] = Entity(name=name, value_type=value_type)

    for column, series in df.items():
        value_type = _get_column_type(series)
        is_entity = column in entity_columns or column in current_entities
        if is_entity:
            upsert_entity(column, value_type)
        elif (
            InferOptions.get_common_options(options, InferOptions.Features)
            and column != timestamp_key
        ):
            if column in features.keys():
                features[column].value_type = value_type
            else:
                features[column] = Feature(name=column, value_type=value_type)
        if value_type == "datetime" and not is_entity:
            timestamp_fields.append(column)

    if InferOptions.get_common_options(options, InferOptions.Index):
        # infer types of index fields
        if df.index.name:
            value_type = _get_column_type(df.index)
            upsert_entity(df.index.name, value_type)
        elif df.index.nlevels > 1:
            for level, name in zip(df.index.levels, df.index.names):
                value_type = _get_column_type(level)
                upsert_entity(name, value_type)
                if value_type == "datetime":
                    timestamp_fields.append(name)

    if len(timestamp_fields) == 1 and not timestamp_key:
        return timestamp_fields[0]
    return timestamp_key
def get_common_processes_at_threshold(self,
                                      signif_gw_results: pd.DataFrame,
                                      affiliation_matrix: pd.DataFrame,
                                      threshold: float,
                                      go_type: str = "name"):
    """Merge each significant-results frame with the shared processes.

    For every ``(key, frame)`` item of ``signif_gw_results`` the frame is
    merged with ``GenewalkObj.min_x_shared_processes(affiliation_matrix,
    threshold)``, matching the frame's ``go_name`` column against the
    ``go_type`` column of the shared-processes frame.

    :return: dict mapping each key to its merged frame
    """
    x_common_processes = {}
    for k, df in signif_gw_results.items():
        shared = GenewalkObj.min_x_shared_processes(affiliation_matrix, threshold)
        x_common_processes[k] = pd.merge(
            df, shared, left_on="go_name", right_on=go_type)
    return x_common_processes
def get_numeric_features(data: pd.DataFrame) -> List[str]:
    """
    List the columns of a pandas dataframe that have a numeric dtype

    # Parameters
    data: `pd.DataFrame`: A pandas Dataframe

    # Returns
    List[str]: Labels of all numeric columns, in column order
    """
    numeric_features = []
    for feature, values in data.items():
        if is_numeric_dtype(values):
            numeric_features.append(feature)
    return numeric_features
def test_replace_input_formats_scalar(self):
    """Exercise ``DataFrame.replace`` with a scalar replacement value."""
    df = DataFrame({'A': [np.nan, 0, np.inf],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})

    # dict to scalar
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    filled = df.replace(to_rep, 0)
    expected = {}
    for label, column in df.items():
        expected[label] = column.replace(to_rep[label], 0)
    assert_frame_equal(filled, DataFrame(expected))

    # A list is not an acceptable value when to_replace is a dict.
    msg = "value argument must be scalar, dict, or Series"
    with pytest.raises(TypeError, match=msg):
        df.replace(to_rep, [np.nan, 0, ''])

    # list to scalar
    to_rep = [np.nan, 0, '']
    result = df.replace(to_rep, -1)
    expected = df.copy()
    # Replace each listed target in turn with the same scalar.
    for target in to_rep:
        expected.replace(target, -1, inplace=True)
    assert_frame_equal(result, expected)