Example #1
File: clean.py Project: yash309/SRA

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must precede the IterativeImputer import
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

def cleaning(data, y, params):
    size = len(data)
    ids = ["ID", "id", "Id", "iD"]

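    # Helper: flag a column whose name contains an ID-like token. Note this
    # is a substring match, so names such as "valid" are also flagged.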
    def isId(i):
        for j in ids:
            if j in i:
                return True
        return False

    print("=" * 100)
    print("Selecting unnecessary columns for dropping...")
    drp = set()
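    # Mark ID-like columns and object columns with very high cardinality
    # (more than 300 distinct values) for dropping.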
    for i in data.columns:
        if (isId(i)) or (data[i].dtype == 'O'
                         and len(data[i].value_counts()) > 300):
            drp.add(i)

    params["nullcount"] = sum(data.isnull().sum() > 0)

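    # Also drop any column in which at least half of the rows are null.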
    for i in data.columns:
        if data[i].isnull().sum() >= size // 2:
            drp.add(i)

    data.drop(drp, axis=1, inplace=True)
    print("Dropped Columns - ", drp)

    print("=" * 100)

    print("Selecting Categorical and Numerical columns...")
    cat_col = list(data.select_dtypes(include='object').columns)
    num_col = list(data.select_dtypes(include=np.number).columns)
    print("Categorical columns are: ", cat_col)
    print("Numerical Columns are: ", num_col)

    print("=" * 100)

    print("Filling NULL values...")
    for i in cat_col:
        data[i] = data[i].fillna("Missing")

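    # Impute numeric nulls with IterativeImputer, which models each feature
    # as a function of the remaining features in round-robin fashion.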
    imp = IterativeImputer(random_state=123)
    data[num_col] = pd.DataFrame(imp.fit_transform(data[num_col]),
                                 columns=num_col, index=data.index)
    print("NULL Values removed!")
    print("Checking is any null values are left: ",
          any(data.isnull().sum() > 0))

    print("=" * 100)

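    # Collapse categories covering at most 1% of rows into a single 'Rare'
    # level before one-hot encoding.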
    if len(cat_col) != 0:
        for i in cat_col:
            temp = data.groupby(i).size() / size
            data.loc[:, i + '_val'] = data[i].map(temp)
            data.loc[data[i + '_val'] <= 0.01, i] = 'Rare'
            data.drop(i + '_val', axis=1, inplace=True)

        print("Creating dummy variables...")
        data = pd.get_dummies(data, drop_first=True)

    print("Final shape of dataset: ", data.shape)

    print("=" * 100)

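    # Keep only the upper triangle of the absolute correlation matrix so each
    # feature pair is tested once, then drop one member of any pair with
    # |corr| >= 0.80.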
    print("Dropping Highly correlated features...")
    corr_matrix = data.corr().abs()
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [
        column for column in upper.columns if any(upper[column] >= 0.80)
    ]
    data.drop(to_drop, axis=1, inplace=True)
    print("Features Dropped: ", to_drop)

    print("=" * 100)

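    # Scale features to [0, 1] so the Lasso coefficients are comparable, then
    # keep the features SelectFromModel retains under its default threshold.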
    print("Selecting important Features...")
    mms = MinMaxScaler()
    data_scaled = mms.fit_transform(data)
    selector1 = LassoCV()
    selector1 = SelectFromModel(selector1)
    selector1.fit(data_scaled, y)
    feat_selected1 = list(data.columns[selector1.get_support()])

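    # Min-max rescale the absolute Lasso coefficients to a 0-100 importance
    # score (taking absolute values first keeps the rescale within [0, 1]).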
    feat_imp = set()
    temp1 = list(np.abs(selector1.estimator_.coef_[selector1.get_support()]))
    mx = max(temp1)
    mn = min(temp1)

    def transform(x):
        # Guard against division by zero when only one coefficient survives.
        return (x - mn) / (mx - mn) if mx != mn else 1.0

    for i in range(len(temp1)):
        feat_imp.add(
            (feat_selected1[i], round(transform(temp1[i]) * 100, 2)))
    feat_imp = list(feat_imp)
    feat_imp.sort(key=lambda x: x[1], reverse=True)
    feat_show = feat_imp[:min(5, len(feat_imp))]
    # Nudge the smallest displayed score up by 1 so a min-scaled feature
    # does not display as exactly 0.
    feat_show[-1] = (feat_show[-1][0], feat_show[-1][1] + 1)
    params["features"] = feat_show

    print(params["features"])

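    # Second selector backed by a random forest; the final feature set is the
    # union of the Lasso and forest selections.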
    selector2 = RandomForestRegressor(random_state=123)
    selector2 = SelectFromModel(selector2)
    selector2.fit(data_scaled, y)
    feat_selected2 = list(data.columns[selector2.get_support()])
    feat_selected = list(set(feat_selected1 + feat_selected2))

    print("Found {} important features".format(len(feat_selected)))

    params["feat_count"] = len(feat_selected)

    data_feat = data[feat_selected]
    print("final shape of dataset", data_feat.shape)

    print("=" * 100)

    return data_feat, y, params
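
A minimal usage sketch, under illustrative assumptions: the CSV path, the SalePrice target column, and the empty params dict below are hypothetical, not part of the project.

import pandas as pd

df = pd.read_csv("train.csv")          # hypothetical dataset
y = df.pop("SalePrice")                # hypothetical numeric target
params = {}                            # cleaning() fills "nullcount", "features", "feat_count"
X_clean, y, params = cleaning(df, y, params)
print(params["feat_count"], "features kept:", params["features"])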