Python DataCleaner.clean Examples

Programming Language: Python

Namespace/Package Name: data_cleaning

Class/Type: DataCleaner

Method/Function: clean

Examples at hotexamples.com: 3

Python DataCleaner.clean - 3 examples found. These are the top-rated real-world Python examples of data_cleaning.DataCleaner.clean, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

DataCleaner(6)

clean(3)

get_clean(2)

filter_frequent_request_types(1)

resolve_nan(1)

remove_space_from_col_names(1)

kaggle(1)

get_missing_value_count(1)

format_zip_code(1)

drop_empty_null_values(1)

drop_unwanted_cols(1)

calculate_time_to_resolve_in_seconds(1)

drop_columns(1)

drop_below_threshold(1)

create_separate_day_month_year_col(1)

columns_with_no_nan(1)

check_if_copyright(1)

capitalize_cols(1)

update_burrow_city_from_zip_code(1)

Example #1

Show file

File: main.py Project: sbrunelli/lead_convertion_predictive_model

import numpy as np
from data_cleaning import DataCleaner
from features_engineering import FeatureExtractor
from model_selection import ModelSelector
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.interactive(True)

# Script entry point: clean the data, split off the target column, make a
# train/test split, featurize both partitions, and convert the training
# labels to a numpy array.
if __name__ == '__main__':
    # Read and clean the data. DataCleaner is constructed with no arguments,
    # so the data source is not visible here.
    # NOTE(review): presumably DataCleaner locates the raw data itself --
    # confirm in data_cleaning.
    dc = DataCleaner()
    data = dc.clean()

    # Disabled debugging aid: when uncommented, dumps the cleaned data to CSV
    # and then halts the script via the failing assert.
    # data.to_csv('./data/debug.csv', index=False, encoding='latin1')
    # assert False

    # Separate the target variable from the features.
    # Assumes clean() returns a pandas DataFrame (TODO confirm); in that case
    # pop() removes 'Target' from `data` in place and returns it.
    target = data.pop('Target')

    # Train/test split. No test_size or random_state is passed, so
    # scikit-learn's defaults apply and the partition is not reproducible
    # from one run to the next.
    data_train, data_test, target_train, target_test = train_test_split(
        data, target)

    # Featurize both partitions with the same FeatureExtractor instance.
    # NOTE(review): whether featurize() learns state from the data it is given
    # cannot be determined from here -- check features_engineering.
    featurizer = FeatureExtractor()
    X_train = featurizer.featurize(data_train)
    X_test = featurizer.featurize(data_test)

    # Convert the training labels to a numpy array.
    y_train = np.array(target_train)

Example #2

Show file

    # Create sub_area categorical with all levels shared
    # between train and test to avoid errors
    test['price_doc'] = -99
    merged = pd.concat([train, test], axis=0)
    merged = merged.merge(gps, how='left', on='sub_area')
    merged['sub_area'] = merged.sub_area.astype('category')
    train = merged[merged.price_doc != -99]
    test = merged[merged.price_doc == -99]
    test.pop('price_doc')

    macro = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
    train = train.merge(macro, how='left', on='timestamp', suffixes=('_train', '_macro'))

    # Clean
    dc = DataCleaner(data=train, sample_rate=0.3)
    data, y = dc.clean()
    y = np.array(y)
    y = np.log(y+1)

    # Train / test split
    data_train, data_test, y_train, y_test = train_test_split(data, y, random_state=77)
    house_ids_test = data_test.id

    # Featurize training data set
    feat_train = Featurizer()
    X_train = feat_train.featurize(data_train)

    # Grid search tune all estimators
    ms = ModelSelector()
    print ' # {:s} | X_train shape: {:s}'.format(now(), X_train.shape)
    print ' # {:s} | y_train size: {:d}'.format(now(), y_train.shape[0])

Example #3

Show file

File: features_extraction.py Project: sbrunelli/russian_housing_market

    gps = pd.read_csv('./data/Longitud_Latitud.csv')
    # Create sub_area categorical with all levels shared
    # between train and test to avoid errors
    test['price_doc'] = -99
    merged = pd.concat([train, test], axis=0)
    merged = merged.merge(gps, how='left', on='sub_area')
    merged['sub_area'] = merged.sub_area.astype('category')
    train = merged[merged.price_doc != -99]

    train = train.merge(macro,
                        how='left',
                        on='timestamp',
                        suffixes=('_train', '_macro'))

    dc = DataCleaner(data=train)
    train, y = dc.clean()
    y = np.array(y)
    y = np.log(y + 1)

    # Featurize training data set
    feat_train = Featurizer()
    train = feat_train.featurize(train)

    print 'train shape', train.shape

    # # Remove all categorical variables for now
    # mask = ~(train.dtypes == 'object').values
    # train = train.iloc[:, mask]
    # print 'train shape with only numerical features', train.shape

    # Print NAs proportions for features with NA values