def scale(x_train, x_test=None, list_of_cols=[], method="minmax", keep_col=False, **algo_kwargs):
    """
    Scales data according to a specific method.

    Parameters
    ----------
    x_train : DataFrame
        Dataset
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        A list of specific columns to apply this technique to.
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns, by default []
    method : str, optional
        Scaling method, by default 'minmax'
    keep_col : bool, optional
        True to not remove the columns, by default False
    algo_kwargs : optional
        Parameters to pass into the scaler constructor
        from Scikit-Learn, by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with rows normalized.
        Returns 2 Dataframes if x_test is provided.
    """
    cols = _numeric_input_conditions(list_of_cols, x_train)

    # Look up and build the scikit-learn scaler for the requested method.
    transformer = SCALER[method](**algo_kwargs)

    train_frame = pd.DataFrame(
        transformer.fit_transform(x_train[cols]), columns=cols
    )
    x_train = drop_replace_columns(x_train, cols, train_frame, keep_col=keep_col)

    if x_test is not None:
        # Reuse the fitted scaler so the test set shares the training statistics.
        test_frame = pd.DataFrame(
            transformer.transform(x_test[cols]), columns=cols
        )
        x_test = drop_replace_columns(x_test, cols, test_frame, keep_col=keep_col)

    return x_train, x_test
def replace_missing_mean_median_mode(
    x_train, x_test=None, list_of_cols=[], strategy=""
):
    """
    Replaces missing values in every numeric column with the mean, median or
    mode of that column, as specified by `strategy`.

    Mean: average value of the column; affected by outliers.

    Median: middle value of a sorted list of numbers; equal to the mean when
    the data follows a normal distribution, and not affected much by anomalies.

    Mode: most common value in the column.

    Parameters
    ----------
    x_train: Dataframe or array like - 2d
        Dataset
    x_test: Dataframe or array like - 2d
        Testing dataset, by default None.
    list_of_cols : list, optional
        A list of specific columns to apply this technique to.
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns, by default []
    strategy : str
        Strategy for replacing missing values.
        Can be either "mean", "median" or "most_frequent"

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with missing values imputed.
        Returns 2 Dataframes if x_test is provided.
    """
    # "most_frequent" also works on non-numeric columns; the other
    # strategies only make sense for numeric data.
    if strategy == "most_frequent":
        cols = _get_columns(list_of_cols, x_train)
    else:
        cols = _numeric_input_conditions(list_of_cols, x_train)

    imputer = SimpleImputer(strategy=strategy)

    train_frame = pd.DataFrame(
        imputer.fit_transform(x_train[cols]), columns=cols
    )
    x_train = drop_replace_columns(x_train, cols, train_frame)

    if x_test is not None:
        # Apply the statistics learned from the training set.
        test_frame = pd.DataFrame(
            imputer.transform(x_test[cols]), columns=cols
        )
        x_test = drop_replace_columns(x_test, cols, test_frame)

    return x_train, x_test
def feature_tfidf(x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs):
    """
    Creates a matrix of the tf-idf score for every word in the corpus as it
    pertains to each document.

    Either the full data or training data plus testing data MUST be provided,
    not both.

    Parameters
    ----------
    x_train : DataFrame
        Dataset
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []
    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it.
    algo_kwargs : optional
        Parameters you would pass into TFIDF constructor, by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.
        Returns 2 Dataframes if x_test data is provided.
    """
    vectorizer = TfidfVectorizer(**algo_kwargs)

    for column in _get_columns(list_of_cols, x_train):
        # Fit on the training text, then expand the scores into one
        # column per vocabulary term.
        train_matrix = vectorizer.fit_transform(x_train[column]).toarray()
        vocab = vectorizer.get_feature_names()
        x_train = drop_replace_columns(
            x_train, column, pd.DataFrame(train_matrix, columns=vocab), keep_col
        )

        if x_test is not None:
            # Transform the test text with the already-fitted vocabulary.
            test_matrix = vectorizer.transform(x_test[column]).toarray()
            x_test = drop_replace_columns(
                x_test, column, pd.DataFrame(test_matrix, columns=vocab), keep_col
            )

    return x_train, x_test
def feature_bag_of_words(x_train, x_test=None, list_of_cols=[], keep_col=False, **algo_kwargs):
    """
    Creates a matrix of how many times a word appears in a document.

    Parameters
    ----------
    x_train : DataFrame
        Training dataset, by default None
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        A list of specific columns to apply this technique to., by default []
    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it.
    algo_kwargs : dict, optional
        Parameters you would pass into Bag of Words constructor as a
        dictionary., by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.
        Returns 2 Dataframes if x_test is provided.
    """
    counter = CountVectorizer(**algo_kwargs)

    for column in _get_columns(list_of_cols, x_train):
        # Fit the vocabulary on the training text and expand the counts
        # into one column per term.
        train_counts = counter.fit_transform(x_train[column]).toarray()
        vocab = counter.get_feature_names()
        x_train = drop_replace_columns(
            x_train, column, pd.DataFrame(train_counts, columns=vocab), keep_col
        )

        if x_test is not None:
            # Count test-set occurrences against the training vocabulary.
            test_counts = counter.transform(x_test[column]).toarray()
            x_test = drop_replace_columns(
                x_test, column, pd.DataFrame(test_counts, columns=vocab), keep_col
            )

    return x_train, x_test
def feature_one_hot_encode(
    x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs
):
    """
    Creates a matrix of converted categorical columns into binary columns of
    ones and zeros.

    Parameters
    ----------
    x_train : DataFrame
        Dataset
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list
        A list of specific columns to apply this technique to.
    keep_col : bool
        A parameter to specify whether to drop the column being transformed,
        by default keep the column, True
    algo_kwargs : optional
        Parameters you would pass into the OneHotEncoder constructor,
        by default {"handle_unknown": "ignore"}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.
        Returns 2 Dataframes if x_test is provided.
    """
    # BUG FIX: merge the default with caller kwargs so a user-supplied
    # `handle_unknown` overrides the default instead of raising
    # "TypeError: got multiple values for keyword argument 'handle_unknown'",
    # which the original `OneHotEncoder(handle_unknown="ignore", **algo_kwargs)`
    # did despite the docstring advertising it as an overridable default.
    enc_params = {"handle_unknown": "ignore", **algo_kwargs}
    enc = OneHotEncoder(**enc_params)
    list_of_cols = _get_columns(list_of_cols, x_train)

    enc_data = enc.fit_transform(x_train[list_of_cols]).toarray()
    enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names(list_of_cols))
    x_train = drop_replace_columns(x_train, list_of_cols, enc_df, keep_col)

    if x_test is not None:
        # Encode the test set with the categories learned during fit;
        # unseen categories become all-zero rows under handle_unknown="ignore".
        enc_x_test = enc.transform(x_test[list_of_cols]).toarray()
        enc_test_df = pd.DataFrame(
            enc_x_test, columns=enc.get_feature_names(list_of_cols)
        )
        x_test = drop_replace_columns(x_test, list_of_cols, enc_test_df, keep_col)

    return x_train, x_test
def feature_hash_vectorizer(x_train, x_test=None, list_of_cols=[], keep_col=True, **hashing_kwargs):
    """
    Returns a hashed encoding of text data.

    Parameters
    ----------
    x_train : DataFrame
        Training dataset, by default None
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        A list of specific columns to apply this technique to., by default []
    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it.
    hashing_kwargs : dict, optional
        Parameters you would pass into Hashing Vectorizer constructor,
        by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.
        Returns 2 Dataframes if x_test is provided.
    """
    hasher = HashingVectorizer(**hashing_kwargs)

    for column in _get_columns(list_of_cols, x_train):
        # Hashed features have no vocabulary, so the resulting columns
        # carry only positional (integer) labels.
        train_matrix = hasher.fit_transform(x_train[column]).toarray()
        x_train = drop_replace_columns(
            x_train, column, pd.DataFrame(train_matrix), keep_col
        )

        if x_test is not None:
            test_matrix = hasher.transform(x_test[column]).toarray()
            x_test = drop_replace_columns(
                x_test, column, pd.DataFrame(test_matrix), keep_col
            )

    return x_train, x_test
def polynomial_features(x_train, x_test=None, list_of_cols=[], **poly_kwargs):
    """
    Computes polynomial features from your existing features.

    Parameters
    ----------
    x_train : DataFrame
        Dataset
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        A list of specific columns to apply this technique to.
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns, by default []
    poly_kwargs : dict or kwargs
        Polynomial Features constructor key word arguments

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the polynomial feature columns.
        Returns 2 Dataframes if x_test is provided.
    """
    poly = PolynomialFeatures(**poly_kwargs)
    list_of_cols = _numeric_input_conditions(list_of_cols, x_train)

    scaled_data = poly.fit_transform(x_train[list_of_cols])
    scaled_df = pd.DataFrame(scaled_data, columns=poly.get_feature_names())
    x_train = drop_replace_columns(x_train, list_of_cols, scaled_df)

    if x_test is not None:
        # BUG FIX: transform only the columns the transformer was fitted on.
        # The original passed the full `x_test` frame, which raises (or
        # silently mis-transforms) whenever x_test contains extra or
        # reordered columns relative to the fitted numeric subset —
        # inconsistent with every sibling function in this module.
        scaled_x_test = poly.transform(x_test[list_of_cols])
        scaled_test_df = pd.DataFrame(
            scaled_x_test, columns=poly.get_feature_names()
        )
        x_test = drop_replace_columns(x_test, list_of_cols, scaled_test_df)

    return x_train, x_test