예제 #1
0
파일: numeric.py 프로젝트: nperera0/aethos
def scale(x_train,
          x_test=None,
          list_of_cols=[],
          method="minmax",
          keep_col=False,
          **algo_kwargs):
    """
    Scales data according to a specific method.

    The scaler is fitted on the training data only; the fitted scaler is
    then re-used to transform the testing data, if any.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns, by default []

    method : str, optional
        Scaling method, used as a key into ``SCALER``, by default 'minmax'

    keep_col : bool, optional
        True to not remove the columns, by default False

    algo_kwargs : optional
        Parameters to pass into the scaler constructor
        from Scikit-Learn, by default {}

    Returns
    -------
    Dataframe, Dataframe or None
        The transformed training dataframe and the transformed testing
        dataframe. A 2-tuple is always returned; the second element is
        None when `x_test` is not provided.
    """

    list_of_cols = _numeric_input_conditions(list_of_cols, x_train)
    # SCALER maps the method name to a scaler class; an unknown method
    # fails right here at the lookup.
    scaler = SCALER[method](**algo_kwargs)

    scaled_data = scaler.fit_transform(x_train[list_of_cols])
    # Re-use the input's row labels. Without `index=` the new frame gets a
    # fresh 0..n-1 index, which no longer lines up with `x_train` when its
    # index is not the default one (e.g. after a shuffled split).
    scaled_df = pd.DataFrame(scaled_data,
                             columns=list_of_cols,
                             index=x_train.index)
    x_train = drop_replace_columns(x_train,
                                   list_of_cols,
                                   scaled_df,
                                   keep_col=keep_col)

    if x_test is not None:
        # Only transform: the test set must not influence the fitted scaler.
        scaled_x_test = scaler.transform(x_test[list_of_cols])
        scaled_test_df = pd.DataFrame(scaled_x_test,
                                      columns=list_of_cols,
                                      index=x_test.index)
        x_test = drop_replace_columns(x_test,
                                      list_of_cols,
                                      scaled_test_df,
                                      keep_col=keep_col)

    return x_train, x_test
예제 #2
0
파일: numeric.py 프로젝트: nperera0/aethos
def replace_missing_mean_median_mode(
    x_train, x_test=None, list_of_cols=[], strategy=""
):
    """
    Replaces missing values in the selected columns with the mean, median or mode of that column, as specified by strategy.

    Mean: Average value of the column. Affected by outliers.
    Median: Middle value of a list of numbers. Equal to the mean if x_train follows normal distribution. Not affected much by anomalies.
    Mode: Most common number in a list of numbers.

    The imputer is fitted on the training data only and re-used to fill
    the testing data, if any.

    Parameters
    ----------
    x_train: Dataframe
        Dataset

    x_test: Dataframe
        Testing dataset, by default None.

    list_of_cols : list, optional
        A list of specific columns to apply this technique to
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns., by default []

    strategy : str
        Strategy for replacing missing values.
        Can be either "mean", "median" or "most_frequent".
        It is passed unchanged to ``SimpleImputer``; the default "" is
        not one of the values listed above, so callers are expected to
        set it.

    Returns
    -------
    Dataframe, Dataframe or None
        The training and testing dataframes with the missing values in the
        selected columns filled in. A 2-tuple is always returned; the
        second element is None when `x_test` is not provided.
    """

    # "most_frequent" resolves its columns through `_get_columns` rather
    # than the numeric-only helper used by the other strategies.
    if strategy != "most_frequent":
        list_of_cols = _numeric_input_conditions(list_of_cols, x_train)
    else:
        list_of_cols = _get_columns(list_of_cols, x_train)

    imp = SimpleImputer(strategy=strategy)

    fit_data = imp.fit_transform(x_train[list_of_cols])
    # Keep the original row labels so the imputed values stay attached to
    # the rows they were computed for, even with a non-default index.
    fit_df = pd.DataFrame(fit_data, columns=list_of_cols, index=x_train.index)
    x_train = drop_replace_columns(x_train, list_of_cols, fit_df)

    if x_test is not None:
        # Fill the test set with the statistics learned from the train set.
        fit_x_test = imp.transform(x_test[list_of_cols])
        fit_test_df = pd.DataFrame(
            fit_x_test, columns=list_of_cols, index=x_test.index
        )
        x_test = drop_replace_columns(x_test, list_of_cols, fit_test_df)

    return x_train, x_test
예제 #3
0
def feature_tfidf(x_train,
                  x_test=None,
                  list_of_cols=[],
                  keep_col=True,
                  **algo_kwargs):
    """
    Creates a matrix of the tf-idf score for every word in the corpus as it pertains to each document.

    Each selected column is vectorized separately: the vectorizer is
    re-fitted on that column of the training data and the same fit is used
    to transform the matching column of the testing data, if any.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it,
        by default True

    algo_kwargs :  optional
        Parameters you would pass into TFIDF constructor, by default {}

    Returns
    -------
    Dataframe, Dataframe or None
        The training and testing dataframes with the tf-idf columns added.
        A 2-tuple is always returned; the second element is None when
        `x_test` is not provided.
    """

    enc = TfidfVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        # The vocabulary is fixed once the vectorizer is fitted, so the
        # feature names are looked up once and shared by train and test.
        feature_names = enc.get_feature_names()
        # Re-use the input's row labels so the new columns line up with
        # the rows of `x_train` even when its index is not 0..n-1.
        enc_df = pd.DataFrame(enc_data,
                              columns=feature_names,
                              index=x_train.index)
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test,
                                       columns=feature_names,
                                       index=x_test.index)
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
예제 #4
0
def feature_bag_of_words(x_train,
                         x_test=None,
                         list_of_cols=[],
                         keep_col=False,
                         **algo_kwargs):
    """
    Creates a matrix of how many times a word appears in a document.

    Each selected column is vectorized separately: the vectorizer is
    re-fitted on that column of the training data and the same fit is used
    to transform the matching column of the testing data, if any.

    Parameters
    ----------
    x_train : DataFrame
        Training dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to., by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it,
        by default False

    algo_kwargs : optional
        Keyword arguments forwarded to the CountVectorizer constructor,
        by default {}

    Returns
    -------
    Dataframe, Dataframe or None
        The training and testing dataframes with the word-count columns
        added. A 2-tuple is always returned; the second element is None
        when `x_test` is not provided.
    """

    enc = CountVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        # The vocabulary is fixed once the vectorizer is fitted, so the
        # feature names are looked up once and shared by train and test.
        feature_names = enc.get_feature_names()
        # Re-use the input's row labels so the new columns line up with
        # the rows of `x_train` even when its index is not 0..n-1.
        enc_df = pd.DataFrame(enc_data,
                              columns=feature_names,
                              index=x_train.index)
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test,
                                       columns=feature_names,
                                       index=x_test.index)
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
예제 #5
0
def feature_one_hot_encode(
    x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs
):
    """
    Creates a matrix of converted categorical columns into binary columns of ones and zeros.

    The encoder is fitted on the training data only and re-used to
    transform the testing data, if any.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list
         A list of specific columns to apply this technique to.

    keep_col : bool
        A parameter to specify whether to drop the column being transformed, by default
        keep the column, True

    algo_kwargs : optional
        Keyword arguments forwarded to the OneHotEncoder constructor.
        `handle_unknown` defaults to "ignore" when the caller does not
        supply it.

    Returns
    -------
    Dataframe, Dataframe or None
        The training and testing dataframes with the one-hot columns
        added. A 2-tuple is always returned; the second element is None
        when `x_test` is not provided.
    """

    # Apply "ignore" as a default rather than a fixed keyword: passing it
    # next to **algo_kwargs raised TypeError ("multiple values for keyword
    # argument") whenever the caller supplied their own handle_unknown.
    algo_kwargs.setdefault("handle_unknown", "ignore")
    enc = OneHotEncoder(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    enc_data = enc.fit_transform(x_train[list_of_cols])
    # The encoder returns a sparse matrix unless the caller asked for dense
    # output through algo_kwargs; only densify when there is something to do.
    if hasattr(enc_data, "toarray"):
        enc_data = enc_data.toarray()
    # The categories are fixed after fitting, so the generated column
    # names are shared between the train and test frames.
    feature_names = enc.get_feature_names(list_of_cols)
    # Re-use the input's row labels so the new columns line up with the
    # rows of `x_train` even when its index is not 0..n-1.
    enc_df = pd.DataFrame(enc_data, columns=feature_names, index=x_train.index)
    x_train = drop_replace_columns(x_train, list_of_cols, enc_df, keep_col)

    if x_test is not None:
        enc_x_test = enc.transform(x_test[list_of_cols])
        if hasattr(enc_x_test, "toarray"):
            enc_x_test = enc_x_test.toarray()
        enc_test_df = pd.DataFrame(
            enc_x_test, columns=feature_names, index=x_test.index
        )
        x_test = drop_replace_columns(x_test, list_of_cols, enc_test_df, keep_col)

    return x_train, x_test
예제 #6
0
def feature_hash_vectorizer(x_train,
                            x_test=None,
                            list_of_cols=[],
                            keep_col=True,
                            **hashing_kwargs):
    """
    Returns a hashed encoding of text data.

    Each selected column is vectorized separately and the resulting
    columns are labelled with their integer position (0, 1, 2, ...).

    Parameters
    ----------
    x_train : DataFrame
        Training dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to., by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it,
        by default True

    hashing_kwargs : optional
        Keyword arguments forwarded to the HashingVectorizer constructor,
        by default {}

    Returns
    -------
    Dataframe, Dataframe or None
        The training and testing dataframes with the hashed columns added.
        A 2-tuple is always returned; the second element is None when
        `x_test` is not provided.
    """

    enc = HashingVectorizer(**hashing_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        # NOTE(review): the hashed matrix is densified here, so memory use
        # grows with rows * n_features - consider passing a small
        # `n_features` through hashing_kwargs; confirm against the
        # scikit-learn default.
        enc_data = enc.fit_transform(x_train[col]).toarray()
        # Re-use the input's row labels so the new columns line up with
        # the rows of `x_train` even when its index is not 0..n-1.
        enc_df = pd.DataFrame(enc_data, index=x_train.index)
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test, index=x_test.index)
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
예제 #7
0
파일: numeric.py 프로젝트: nperera0/aethos
def polynomial_features(x_train, x_test=None, list_of_cols=[], **poly_kwargs):
    """
    Computes polynomial features from your existing features.

    The transformer is fitted on the selected columns of the training data
    and re-used on the same columns of the testing data, if any.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns, by default []

    poly_kwargs : dict or kwargs
        Polynomial Features constructor key word arguments

    Returns
    -------
    Dataframe, Dataframe or None
        The training and testing dataframes with the polynomial feature
        columns. A 2-tuple is always returned; the second element is None
        when `x_test` is not provided.
    """

    poly = PolynomialFeatures(**poly_kwargs)
    list_of_cols = _numeric_input_conditions(list_of_cols, x_train)

    poly_data = poly.fit_transform(x_train[list_of_cols])
    # The generated feature names depend only on the fit, so they are
    # looked up once and shared by the train and test frames.
    feature_names = poly.get_feature_names()
    # Re-use the input's row labels so the new columns line up with the
    # rows of `x_train` even when its index is not 0..n-1.
    poly_df = pd.DataFrame(poly_data, columns=feature_names, index=x_train.index)
    x_train = drop_replace_columns(x_train, list_of_cols, poly_df)

    if x_test is not None:
        # Transform the same column subset the transformer was fitted on.
        # Passing the whole of `x_test` fed it a different set of features
        # whenever the frame held any column outside `list_of_cols`.
        poly_x_test = poly.transform(x_test[list_of_cols])
        poly_test_df = pd.DataFrame(
            poly_x_test, columns=feature_names, index=x_test.index
        )
        x_test = drop_replace_columns(x_test, list_of_cols, poly_test_df)

    return x_train, x_test