def plot_cat_scatter(df, x_cols, y, format='png'):
    """Save one categorical scatter plot of ``y`` for every column in ``x_cols``.

    Each figure is drawn with ``seaborn.catplot`` and written to
    ``./tmp/images/<col>_to_<y>_cat_scatter.<format>``, where the column
    names are first passed through ``to_file_save_name``.

    Args:
        df: ``pandas.DataFrame`` holding the data to plot.
        x_cols: Column names that are used, one per figure, as the x axis.
        y: Column name plotted on the y axis; must not appear in ``x_cols``.
        format: Image format handed to ``plt.savefig`` and used as the file
            extension.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        ConfigError: If ``y`` is also listed in ``x_cols``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if y in x_cols:
        from util.exceptions import ConfigError
        raise ConfigError(
            'The label "{}" was also configured as a feature in x_cols for function "plot_cat_scatter".'
            .format(y))
    import seaborn as sns
    for col in x_cols:
        grid = sns.catplot(x=col, y=y, data=df)
        ax = grid.axes[0, 0]
        # Rotate the labels seaborn already placed rather than overwriting
        # them with df[col].unique(): seaborn chooses its own category order
        # (numeric values are sorted, missing values are dropped), so the
        # order of appearance in the frame can mislabel the ticks.
        ax.tick_params(axis='x', labelrotation=90)
        n_categories = df[col].nunique()
        plt.xticks(range(int(n_categories)))
        plt.gca().margins(x=0)
        plt.gcf().canvas.draw()
        m = 0.1
        # The margin is computed against the figure width *before* the resize
        # below; this ordering is kept on purpose so the layout is unchanged.
        margin = m / plt.gcf().get_size_inches()[0]
        # bottom=0.333 leaves room for the rotated x tick labels.
        plt.gcf().subplots_adjust(left=margin, right=1. - margin, bottom=0.333)
        plt.gcf().set_size_inches(30, 7)

        plt.savefig('./tmp/images/{}_to_{}_cat_scatter.{}'.format(
            to_file_save_name(col), to_file_save_name(y), format),
                    format=format)
        plt.close(plt.gcf())
Example #2
0
def train_val_test_split(df, label=None, split=(0.6, 0.2, 0.2), seed=None):
    """Shuffle ``df`` and cut it into three consecutive partitions.

    After shuffling, the first ``split[0]`` share of the rows forms the
    training part, the next ``split[1]`` share forms the part returned in
    the *test* position, and all remaining rows form the part returned in
    the *val* position.

    Args:
        df: ``pandas.DataFrame`` to split; it is copied, not modified.
        label: Name of the target column. When ``None`` the partitions are
            returned unsplit and every ``y_*`` slot is ``None``.
        split: Three fractions that must sum to 1 (within floating-point
            tolerance).
        seed: Random state forwarded to ``shuffle``. ``None`` or the string
            ``'None'`` means an unseeded shuffle.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, X_val, y_val)`` of numpy
        arrays (``y_*`` entries are ``None`` when ``label`` is ``None``).

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        DataSplitError: If the fractions in ``split`` do not sum to 1.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0 in
    # binary floating point and would be rejected by a strict equality test.
    split_total = np.sum(split)
    if not np.isclose(split_total, 1.0):
        raise DataSplitError(split_total)

    # The string 'None' is accepted as "no seed" (presumably for seeds that
    # arrive from a text configuration - confirm with callers).
    random_state = None if seed is None or seed == 'None' else seed
    df_new = shuffle(df.copy(), random_state=random_state)

    m = len(df_new)
    train_end = int(split[0] * m)
    test_end = int(split[1] * m) + train_end

    train = df_new[:train_end]
    test = df_new[train_end:test_end]
    val = df_new[test_end:]

    if label is None:
        # Without a label column there are no targets to hand back.
        return train.values, None, test.values, None, val.values, None

    y_train = train[label].values
    X_train = train.drop([label], axis=1).values
    y_test = test[label].values
    X_test = test.drop([label], axis=1).values
    y_val = val[label].values
    X_val = val.drop([label], axis=1).values

    return X_train, y_train, X_test, y_test, X_val, y_val
Example #3
0
def apply_binary_encoding(df, categorical_columns):
    """Binary-encode the given categorical columns of ``df``.

    Args:
        df: ``pandas.DataFrame`` to encode.
        categorical_columns: Columns handed to
            ``category_encoders.BinaryEncoder`` as ``cols``.

    Returns:
        The encoder's transform of ``df``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    encoder = ce.BinaryEncoder(cols=categorical_columns)
    # Fit on the DataFrame itself, not on df.values: a bare array has no
    # column names, so the names in `cols` and the frame passed to
    # transform() would not line up with what the encoder was fitted on.
    encoder.fit(df)
    X_transformed = encoder.transform(df)
    return X_transformed
Example #4
0
def describe_data(df, name, n, include='all'):
    """Write a plain-text description of ``df`` via ``to_txt_with_versioning``.

    The report contains, in order: the transposed ``df.describe`` table, the
    column count, the total count of null cells, the dtypes, and the first
    and last ``n`` rows.

    Args:
        df: ``pandas.DataFrame`` to describe.
        name: Name used in the report title and in the target path
            ``./tmp/description/<name>``.
        n: Number of rows shown for the head and the tail.
        include: Forwarded to ``DataFrame.describe``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)

    # Sections are listed in output order and concatenated without any
    # extra separator; the spacing lives inside the literals themselves.
    sections = [
        'Data description for {}\n'.format(name),
        df.describe(include=include).T.to_string(),
        '\n\nThe data set has {} columns.'.format(len(df.columns)),
        '\n\nThe data set has {} NaN values.'.format(
            df.isnull().values.sum()),
        '\n\nDataFrame data types:\n\n',
        df.dtypes.to_string(),
        '\n\nDataFrame head{}:\n\n'.format(n),
        df.head(n).to_string(),
        '\n\n DataFrame tail{}:\n\n'.format(n),
        df.tail(n).to_string(),
    ]
    report = ''.join(sections)

    to_txt_with_versioning('./tmp/description/{}'.format(name), report)
Example #5
0
def apply_weight_of_evidence_encoding(df, categorical_columns, label='y'):
    """Weight-of-evidence encode the categorical feature columns of ``df``.

    The encoder is fitted on the feature columns (everything except
    ``label``) against the ``label`` column, the same feature columns are
    transformed, and the untouched label column is attached to the result.

    Args:
        df: ``pandas.DataFrame`` containing features and the label column.
        categorical_columns: Columns handed to
            ``category_encoders.WOEEncoder`` as ``cols``.
        label: Name of the target column.

    Returns:
        The encoded feature frame with the original ``label`` column added.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    features = df.drop([label], axis=1)
    encoder = ce.WOEEncoder(cols=categorical_columns).fit(features, df[label])
    # Transform exactly the columns the encoder was fitted on; passing the
    # full frame would add the label column, which the encoder never saw.
    X_transformed = encoder.transform(features)
    X_transformed[label] = df[label]
    return X_transformed
Example #6
0
def drop_csv_column(df):
    """Remove the ``'Unnamed: 0'`` / ``'Unnamed: 0.1'`` columns from ``df``.

    Columns that are absent are skipped. The frame is modified in place and
    also returned.

    Args:
        df: ``pandas.DataFrame`` to clean.

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    for unnamed in ('Unnamed: 0', 'Unnamed: 0.1'):
        if unnamed in df.columns:
            df.drop(columns=[unnamed], inplace=True)
    return df
Example #7
0
def apply_sum_encoding(df, categorical_columns):
    """Sum-encode the given categorical columns of ``df``.

    Args:
        df: ``pandas.DataFrame`` to encode.
        categorical_columns: Columns handed to
            ``category_encoders.SumEncoder`` as ``cols``.

    Returns:
        The encoder's transform of ``df`` without an ``'intercept'`` column.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # Fit on the DataFrame itself, not on df.values: a bare array has no
    # column names, so the names in `cols` and the frame passed to
    # transform() would not line up with what the encoder was fitted on.
    encoder = ce.SumEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    # Drop the constant column from the *transformed* frame (the previous
    # code referenced an undefined name). errors='ignore' tolerates encoder
    # output that has no 'intercept' column - presumably version dependent,
    # confirm against the installed category_encoders release.
    X_transformed.drop(['intercept'], inplace=True, axis=1, errors='ignore')
    return X_transformed
Example #8
0
def extract_X_y(df, label):
    """Split ``df`` into a feature array and a target array.

    Args:
        df: ``pandas.DataFrame`` holding features and, optionally, a label.
        label: Name of the target column, or ``None`` when the frame has no
            target.

    Returns:
        Tuple ``(X, y)``. ``X`` is ``df`` without the label column as a numpy
        array; ``y`` is the label column's values, or ``None`` when ``label``
        is ``None`` (in which case ``X`` holds every column).

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if label is None:
        # No target column: nothing to drop, every column is a feature.
        return df.values, None
    y = df[label].values
    X = df.drop([label], axis=1).values
    return X, y
Example #9
0
def clean_labels(df, label):
    """Replace the values of the ``label`` column with integer class codes.

    Codes are assigned in order of first appearance: the first distinct
    value becomes 0, the next new value 1, and so on. Missing values receive
    the code -1. The frame is modified in place and also returned.

    Args:
        df: ``pandas.DataFrame`` containing the label column.
        label: Name of the column to recode.

    Returns:
        The same ``df`` object with ``df[label]`` holding integer codes.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    # factorize numbers values by first appearance, i.e. the position each
    # value has in df[label].unique(), in a single vectorised pass instead
    # of a per-row list search.
    codes, _ = pd.factorize(df[label])
    df[label] = codes
    return df
Example #10
0
def apply_leave_one_out_encoding(df, categorical_columns, label='y'):
    """Encode categorical features with ``category_encoders.LeaveOneOutEncoder``.

    The encoder is fitted on the feature columns against the label column,
    the feature columns are transformed, and the original label column is
    attached to the transformed frame.

    Args:
        df: ``pandas.DataFrame`` containing features and the label column.
        categorical_columns: Columns handed to the encoder as ``cols``.
        label: Name of the target column.

    Returns:
        The encoded feature frame with the ``label`` column added.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    features = df.drop([label], axis=1)
    target = df[label]
    loo = ce.LeaveOneOutEncoder(return_df=True, cols=categorical_columns)
    loo = loo.fit(features, target)
    encoded = loo.transform(df.drop([label], axis=1))
    encoded[label] = df[label]
    return encoded
def plot_regression(df, cols, cat_cols, format='png'):
    """Save ``seaborn.lmplot`` figures for every column pair and category.

    For each unordered pair drawn from ``cols`` and each column in
    ``cat_cols``, a regression plot faceted by that category column is
    written to
    ``./tmp/images/<x>_to_<y>_by_<cat>_regression.<format>``.

    Args:
        df: ``pandas.DataFrame`` holding the data to plot.
        cols: Column names to pair up as x and y.
        cat_cols: Column names used as the ``col`` facet of ``lmplot``.
        format: Image format handed to ``plt.savefig`` and used as the file
            extension.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import seaborn as sns
    from itertools import combinations
    path_template = './tmp/images/{}_to_{}_by_{}_regression.{}'
    for x_col, y_col in combinations(cols, 2):
        for cat_col in cat_cols:
            sns.lmplot(x=x_col, y=y_col, col=cat_col, data=df)
            target = path_template.format(to_file_save_name(x_col),
                                          to_file_save_name(y_col),
                                          to_file_save_name(cat_col),
                                          format)
            plt.savefig(target, format=format)
            # Close the figure lmplot just created so figures do not pile up.
            plt.close(plt.gcf())
Example #12
0
def clean_numeric(df, cols):
    """Keep only the rows whose values in ``cols`` can be converted by ``float``.

    Args:
        df: ``pandas.DataFrame`` to filter.
        cols: Column names that must hold float-convertible values.

    Returns:
        A filtered frame; the input frame is not modified.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)

    def is_number(s):
        try:
            float(s)
        except (TypeError, ValueError):
            # TypeError covers values such as None or lists, which float()
            # rejects by type rather than by content.
            return False
        return True

    for col in cols:
        df = df[df[col].apply(is_number)]
    return df
def apply_pca(df, label=None, n_components=None):
    """Project the feature columns of ``df`` onto principal components.

    Args:
        df: ``pandas.DataFrame`` with the features (and optionally a label).
        label: Name of a column to exclude from the projection and re-attach
            unchanged to the result, or ``None`` to project every column.
        n_components: Forwarded to ``sklearn.decomposition.PCA``.

    Returns:
        A ``pandas.DataFrame`` indexed like ``df`` with columns ``PCA0``,
        ``PCA1``, ... plus the ``label`` column when one was given.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    features = df if label is None else df.drop([label], axis=1)
    transformed = pca.fit_transform(features)
    # Name the columns from the actual output width: n_components may be
    # None (the default) or another non-integer setting accepted by PCA, so
    # it cannot be fed to range() directly.
    columns = ['PCA%i' % i for i in range(transformed.shape[1])]
    X = pd.DataFrame(transformed, columns=columns, index=df.index)
    if label is not None:
        X[label] = df[label]
    return X
def plot_scatter(df, cols, y, format='png'):
    """Save a scatter plot, coloured by ``y``, for every pair of ``cols``.

    Each unordered pair drawn from ``cols`` is plotted with
    ``seaborn.scatterplot`` using ``y`` as hue and written to
    ``./tmp/images/<x>_to_<y_col>_by_<y>_scatter.<format>``.

    Args:
        df: ``pandas.DataFrame`` holding the data to plot.
        cols: Column names to pair up as the two plot axes.
        y: Column name used as hue; must not appear in ``cols``.
        format: Image format handed to ``plt.savefig`` and used as the file
            extension.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        ConfigError: If ``y`` is also listed in ``cols``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if y in cols:
        from util.exceptions import ConfigError
        message = 'The label "{}" was also configured as a feature in cols for function "plot_scatter".'
        raise ConfigError(message.format(y))
    import seaborn as sns
    from itertools import combinations
    path_template = './tmp/images/{}_to_{}_by_{}_scatter.{}'
    for x_col, y_col in combinations(cols, 2):
        sns.scatterplot(x=x_col, y=y_col, hue=y, data=df, s=10, alpha=0.4)
        target = path_template.format(to_file_save_name(x_col),
                                      to_file_save_name(y_col),
                                      to_file_save_name(y), format)
        plt.savefig(target, format=format)
        # Close the current figure so the next pair starts from a clean one.
        plt.close(plt.gcf())
def plot_boxplots(df, columns=-1, label=None, format='png'):
    """Save one box plot per column to ``./tmp/images/<col>_box.<format>``.

    Args:
        df: ``pandas.DataFrame`` holding the data to plot.
        columns: Column names to plot, or the scalar ``-1`` (default) to plot
            every column of ``df``.
        label: Optional column to group the boxes by. The label column itself
            is plotted ungrouped.
        format: Image format handed to ``plt.savefig`` and used as the file
            extension.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    import numbers

    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)

    # Only a scalar -1 means "all columns". Comparing an Index or array with
    # -1 is elementwise and cannot be used as an `if` condition, so check the
    # type before comparing.
    if isinstance(columns, numbers.Integral) and columns == -1:
        columns = df.columns

    for col in columns:
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        if label is None or col == label:
            df.boxplot(column=col, ax=ax)
        else:
            df.boxplot(column=col, by=label, ax=ax)
        plt.savefig('./tmp/images/{}_box.{}'.format(to_file_save_name(col),
                                                    format),
                    format=format)
        plt.close(fig)
def plot_cat_point(df, x_cols, y, format='png'):
    """Save a categorical point plot of ``y`` for every pair of ``x_cols``.

    For each unordered pair drawn from ``x_cols`` the first column is used
    as the x axis and the second as hue in ``seaborn.catplot(kind='point')``.
    Figures are written to
    ``./tmp/images/<x>_to_<y>_hue_<hue>_cat_point.<format>``.

    Args:
        df: ``pandas.DataFrame`` holding the data to plot.
        x_cols: Column names to pair up as x axis and hue.
        y: Column name plotted on the y axis; must not appear in ``x_cols``.
        format: Image format handed to ``plt.savefig`` and used as the file
            extension.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        ConfigError: If ``y`` is also listed in ``x_cols``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if y in x_cols:
        from util.exceptions import ConfigError
        raise ConfigError(
            'The label "{}" was also configured as a feature in x_cols for function "plot_cat_point".'
            .format(y))
    import seaborn as sns
    import itertools
    for x_col, hue_col in itertools.combinations(x_cols, 2):
        grid = sns.catplot(x=x_col, y=y, hue=hue_col, kind='point', data=df)
        ax = grid.axes[0, 0]
        # Rotate the labels seaborn already placed rather than overwriting
        # them with df[x_col].unique(): seaborn chooses its own category
        # order (numeric values are sorted, missing values are dropped), so
        # the order of appearance in the frame can mislabel the ticks.
        ax.tick_params(axis='x', labelrotation=90)
        n_categories = df[x_col].nunique()
        plt.xticks(range(int(n_categories)))
        plt.gca().margins(x=0)
        plt.gcf().canvas.draw()
        m = 0.1
        # The margin is computed against the figure width *before* the resize
        # below; this ordering is kept on purpose so the layout is unchanged.
        margin = m / plt.gcf().get_size_inches()[0]
        # bottom=0.333 leaves room for the rotated x tick labels.
        plt.gcf().subplots_adjust(left=margin, right=1. - margin, bottom=0.333)
        plt.gcf().set_size_inches(30, 7)
        plt.savefig('./tmp/images/{}_to_{}_hue_{}_cat_point.{}'.format(
            to_file_save_name(x_col), to_file_save_name(y),
            to_file_save_name(hue_col), format),
                    format=format)
        plt.close(plt.gcf())
Example #17
0
def replace_NaN_with_mean(df):
    """Fill missing values in numeric columns with that column's mean.

    Means are computed with ``numeric_only=True``; columns without a computed
    mean are left as they are. The frame is modified in place and also
    returned.

    Args:
        df: ``pandas.DataFrame`` to fill.

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    column_means = df.mean(numeric_only=True)
    # A {column: mean} mapping makes fillna use a per-column fill value.
    fill_values = column_means.to_dict()
    df.fillna(value=fill_values, inplace=True)
    return df
Example #18
0
def remove_NaN_columns(df, how='any'):
    """Drop columns containing missing values from ``df`` in place.

    Args:
        df: ``pandas.DataFrame`` to prune.
        how: Forwarded to ``DataFrame.dropna`` (``'any'`` or ``'all'``).

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        df.dropna(axis='columns', how=how, inplace=True)
        return df
    raise DataFrameTypeError('df', df)
Example #19
0
def apply_pca(df, categorical_columns, label='y'):
    """Placeholder: validate ``df`` and return ``None``.

    NOTE(review): another ``apply_pca`` with a different signature appears
    earlier in this source; if both live in the same module this later
    definition replaces it - confirm which one callers expect.

    Args:
        df: Must be a ``pandas.DataFrame``.
        categorical_columns: Currently unused.
        label: Currently unused.

    Returns:
        Always ``None``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        # TODO: implement
        return None
    raise DataFrameTypeError('df', df)
Example #20
0
def apply_dummy_encoding(df):
    """Return ``pd.get_dummies(df)``, i.e. ``df`` with indicator columns.

    Args:
        df: ``pandas.DataFrame`` to encode.

    Returns:
        The new frame produced by ``pandas.get_dummies``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        return pd.get_dummies(df)
    raise DataFrameTypeError('df', df)
Example #21
0
def drop_columns(df, cols):
    """Drop the columns ``cols`` from ``df`` in place.

    Args:
        df: ``pandas.DataFrame`` to modify.
        cols: Column label or list of labels handed to ``DataFrame.drop``.

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        df.drop(columns=cols, inplace=True)
        return df
    raise DataFrameTypeError('df', df)
Example #22
0
def apply_standard_scaler_scaling(df):
    """Scale every column of ``df`` with ``sklearn``'s ``StandardScaler``.

    The scaled values are written back into ``df``, which is also returned.

    Args:
        df: ``pandas.DataFrame`` whose columns are all scaled.

    Returns:
        The same ``df`` object holding the scaled values.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[df.columns])
    df[df.columns] = scaled
    return df