Exemple #1
0
    def clean_data(self, replace_data=False):
        """
        Run datacleaner's ``autoclean`` over the training and testing data.

        Parameters
        ----------
        replace_data : bool, default=False
            When truthy, the cleaned frames are also passed to
            ``self.set_data``.

        Returns
        -------
        tuple
            ``(clean_training, clean_testing)`` as returned by ``autoclean``.
        """
        cleaned_train = autoclean(self.training_data)
        cleaned_test = autoclean(self.testing_data)

        if not replace_data:
            return cleaned_train, cleaned_test

        self.set_data(cleaned_train, cleaned_test)
        return cleaned_train, cleaned_test
Exemple #2
0
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs.

    The expected frame is built by hand: the numeric column 'A' is filled
    with its median, the string column 'C' is filled with its most frequent
    value and then label-encoded.
    """
    data = pd.DataFrame({
        'A': np.random.rand(1000),
        'B': np.random.rand(1000),
        'C': np.random.randint(0, 3, 1000)
    })

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan
    # print() with a single argument behaves the same on Python 2 and 3.
    print(data)
    hand_cleaned_data = data.copy()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    hand_cleaned_data['A'] = hand_cleaned_data['A'].fillna(
        hand_cleaned_data['A'].median())
    hand_cleaned_data['C'] = hand_cleaned_data['C'].fillna(
        hand_cleaned_data['C'].mode()[0])
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(
        hand_cleaned_data['C'].values)
    print(hand_cleaned_data)
    cleaned_data = autoclean(data)
    print(cleaned_data)
    assert cleaned_data.equals(hand_cleaned_data)
    def preprocess_data(self, data, target_column):
        """Clean ``data`` and split it into a feature frame and a target column.

        ``data`` is passed through ``autoclean`` and then through
        ``self.remove_unnamed_columns``.

        Returns
        -------
        tuple
            ``(X, y)``: the cleaned frame without ``target_column``, and the
            ``target_column`` of the cleaned frame.
        """
        cleaned = self.remove_unnamed_columns(autoclean(data))
        features = cleaned.drop(target_column, axis=1)
        target = cleaned[target_column]
        return features, target
Exemple #4
0
def test_autoclean_already_clean_data():
    """Check that autoclean() returns already-clean numeric data unchanged."""
    n_rows = 1000
    columns = {
        'A': np.random.rand(n_rows),
        'B': np.random.rand(n_rows),
        'C': np.random.randint(0, 3, n_rows),
    }
    data = pd.DataFrame(columns)

    result = autoclean(data)

    # Nothing needed cleaning, so the result must equal the input.
    assert result.equals(data)
def test_autoclean_already_clean_data():
    """Check that autoclean() returns already-clean numeric data unchanged."""
    n_rows = 1000
    columns = {
        'A': np.random.rand(n_rows),
        'B': np.random.rand(n_rows),
        'C': np.random.randint(0, 3, n_rows),
    }
    data = pd.DataFrame(columns)

    result = autoclean(data)

    # Nothing needed cleaning, so the result must equal the input.
    assert result.equals(data)
    def load_dataset(self, file_name):
        """Read ``./uploads/<file_name>.hdf``, clean it and select the active axes.

        The frame is run through ``autoclean`` with ``drop_nans=True`` and
        stored on ``self._data``. The first and second column names become
        the active x and y columns; a third column, when present, becomes
        the active z column.
        """
        hdf_path = './uploads/%s.hdf' % file_name
        raw_frame = pd.read_hdf(hdf_path)
        self._data = autoclean(raw_frame, drop_nans=True)
        self._column_names = self._data.columns.values

        names = self._column_names
        self._active_x = names[0]
        self._active_y = names[1]
        if len(names) > 2:
            self._active_z = names[2]
Exemple #7
0
 def do_autoclean(self):
     """Autoclean the stored frame(s) in place on ``self``.

     Without a test frame only ``self.train_df`` is cleaned through
     ``datacleaner.autoclean``. Otherwise both frames go through
     ``do_autoclean_cv`` with the options read from ``self.cfg``.
     """
     if self.test_df is None:
         self.train_df = datacleaner.autoclean(self.train_df,
                                               ignore_update_check=True)
         return

     cleaned_pair = do_autoclean_cv(
         self.train_df,
         self.test_df,
         do_autoclean=self.cfg.get('do_autoclean'),
         predict_colname=self.cfg['predict_colname'])
     self.train_df, self.test_df = cleaned_pair
Exemple #8
0
def clean_csv(csvfile, basedir):
    '''
    Clean a CSV file with datacleaner.autoclean and save the result.

    https://github.com/rhiever/datacleaner

    Parameters
    ----------
    csvfile : str
        Path of the CSV file to read.
    basedir : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list
        A one-element list with the path of the cleaned file, i.e. the
        input file name prefixed with ``clean_`` in the same directory.
    '''
    import os

    input_dataframe = pd.read_csv(csvfile)
    newframe = datacleaner.autoclean(input_dataframe,
                                     drop_nans=False,
                                     copy=False,
                                     ignore_update_check=False)
    # Prefix only the file name. Prefixing the whole string would turn
    # 'dir/x.csv' into 'clean_dir/x.csv', which points into another directory.
    directory, filename = os.path.split(csvfile)
    newfile = os.path.join(directory, 'clean_' + filename)
    newframe.to_csv(newfile, index=False)
    return [newfile]
Exemple #9
0
def explore_features(df):
    """Show a Pearson Rank2D plot of the features in ``df``.

    Works on a copy of ``df``: categorical columns are converted to strings,
    the copy is passed through ``autoclean``, and the result is fitted and
    displayed with a ``Rank2D`` visualizer.
    """
    df_copy = df.copy()

    # For some reason the visualizer doesn't accept categorical
    # variables; those have to be converted to strings.
    # Iterate over the column labels directly: DataFrame.iteritems() was
    # removed in pandas 2.0 and the per-column data it yielded was unused.
    for col in df_copy.columns:
        if df_copy[col].dtype.name == "category":
            df_copy[col] = df_copy[col].astype(str)

    numeric_df = autoclean(df_copy)
    visualizer = Rank2D(algorithm="pearson")
    visualizer.fit_transform(numeric_df)
    visualizer.poof()
def load_selected_file(file_name):
    """Read ``./uploads/<file_name>.hdf`` and return it autocleaned.

    Parameters
    ----------
    file_name : str
        Name of the file to load, without directory or ``.hdf`` extension.

    Returns
    -------
    pandas.dataframe
        The result of ``autoclean`` called with ``drop_nans=True``.
    """
    hdf_path = './uploads/%s.hdf' % file_name
    loaded = pd.read_hdf(hdf_path)
    return autoclean(loaded, drop_nans=True)
def clean_data(original_df):
    """Clean, outlier-process and z-score standardize a copy of ``original_df``.

    Steps, in order:
      1. Deep-copy the input and delete the columns "index", "trade_date",
         "stock_code", "trade_status", "data_source" and "created_date".
      2. Run the remaining columns through ``autoclean``.
      3. For three growth-rate columns, copy every non-zero, non-null value
         onto the two preceding positions.
      4. Replace every column with ``mad_based_outlier(column)``.
      5. Standardize every column as ``(x - mean) / std``.

    Returns the resulting DataFrame; ``original_df`` itself is not modified
    by the deletions because they are applied to the deep copy.
    """
    # MAD method
    temp_df = copy.deepcopy(original_df)
    del temp_df["index"]
    del temp_df["trade_date"]
    del temp_df["stock_code"]    
    del temp_df["trade_status"]
    del temp_df["data_source"]
    del temp_df["created_date"]
    # Use datacleaner's autoclean to handle missing values.
    # As long as a column has some non-null data, the whole column ends up filled in.
    # If an entire column is empty, that column cannot be repaired.
    # temp_df.loc[10:20, 'ev2_to_ebitda'] = 30
    clean_data_df = autoclean(temp_df)
    # clean_data_df.to_csv('auto_clean.csv', sep=',', index=False)
    # print clean_data_df
############# Growth rates derived from quarterly data contain regularly spaced 0 values that need to be filled ##############################
    # NOTE(review): `df[col][k] = v` is chained assignment; confirm it writes
    # through to clean_data_df on the pandas version in use.
    # NOTE(review): for i == 0 or 1 the targets i-1 / i-2 are negative;
    # confirm how the frame's index treats those keys.
    for i in range(len(clean_data_df["grossprofitmargin_growthrate"])):
        if clean_data_df["grossprofitmargin_growthrate"][i] != 0 and pd.isnull(clean_data_df["grossprofitmargin_growthrate"][i]) == False:
            clean_data_df["grossprofitmargin_growthrate"][i-1] = clean_data_df["grossprofitmargin_growthrate"][i]
            clean_data_df["grossprofitmargin_growthrate"][i-2] = clean_data_df["grossprofitmargin_growthrate"][i]

    # Same back-fill for the ROE growth rate.
    for i in range(len(clean_data_df["roe_ttm2_growthrate"])):
        if clean_data_df["roe_ttm2_growthrate"][i] != 0 and pd.isnull(clean_data_df["roe_ttm2_growthrate"][i]) == False:
            clean_data_df["roe_ttm2_growthrate"][i-1] = clean_data_df["roe_ttm2_growthrate"][i]
            clean_data_df["roe_ttm2_growthrate"][i-2] = clean_data_df["roe_ttm2_growthrate"][i]

    # Same back-fill for the ROA growth rate.
    for i in range(len(clean_data_df["roa_ttm2_growthrate"])):
        if clean_data_df["roa_ttm2_growthrate"][i] != 0 and pd.isnull(clean_data_df["roa_ttm2_growthrate"][i]) == False:
            clean_data_df["roa_ttm2_growthrate"][i-1] = clean_data_df["roa_ttm2_growthrate"][i]
            clean_data_df["roa_ttm2_growthrate"][i-2] = clean_data_df["roa_ttm2_growthrate"][i]

    # clean_data_df.to_csv('auto_clean1.csv', sep=',', index=False)
###############################################################################
    # print clean_data_df.columns
    # Replace each column with the output of mad_based_outlier (defined
    # elsewhere; presumably MAD-based outlier handling, verify there).
    for column in clean_data_df.columns:
        clean_data_df[column] = mad_based_outlier(clean_data_df[column])
        # print max(clean_data_df[column])
    # Standardization
    # Method 1 (min-max scaling, disabled)
    # clean_data_df = clean_data_df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))) 
    # Method 2 (min-max scaling, disabled)
    # (clean_data_df - clean_data_df.min()) / (clean_data_df.max() - clean_data_df.min())
    # Z-score standardization
    clean_data_df = clean_data_df.apply(lambda x: (x - np.average(x)) / np.std(x)) 

    # clean_data_df.to_csv('clean_data_df1.csv', sep=',', index=False)

    return clean_data_df
Exemple #12
0
def test_autoclean_no_nans_with_strings():
    """Check autoclean() on NaN-free data with one string-encoded categorical column."""
    n_rows = 1000
    data = pd.DataFrame({
        'A': np.random.rand(n_rows),
        'B': np.random.rand(n_rows),
        'C': np.random.randint(0, 3, n_rows),
    })

    fruit_by_code = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda code: fruit_by_code[code])

    # Expected result: only the string column changes, via label encoding.
    expected = data.copy()
    encoder = LabelEncoder()
    expected['C'] = encoder.fit_transform(expected['C'].values)

    actual = autoclean(data)

    assert actual.equals(expected)
def test_autoclean_no_nans_with_strings():
    """Check autoclean() on NaN-free data with one string-encoded categorical column."""
    n_rows = 1000
    data = pd.DataFrame({
        'A': np.random.rand(n_rows),
        'B': np.random.rand(n_rows),
        'C': np.random.randint(0, 3, n_rows),
    })

    fruit_by_code = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda code: fruit_by_code[code])

    # Expected result: only the string column changes, via label encoding.
    expected = data.copy()
    encoder = LabelEncoder()
    expected['C'] = encoder.fit_transform(expected['C'].values)

    actual = autoclean(data)

    assert actual.equals(expected)
Exemple #14
0
    def execute(self, params, **kwargs):
        """Split the initial dataset into train/test parts and store them.

        The target column name is read from ``params["target"]``. The
        features (every other column) are autocleaned; the target column is
        taken as is. An 80/20 split with ``random_state=42`` is stored on
        ``self.marvin_dataset`` under the keys "X_train", "X_test",
        "y_train" and "y_test".
        """
        target = params.get("target")
        features = autoclean(self.marvin_initial_dataset.drop(target, axis=1))
        labels = self.marvin_initial_dataset[target]

        train_X, test_X, train_y, test_y = train_test_split(
            features, labels, test_size=0.2, random_state=42)

        self.marvin_dataset = {
            "X_train": train_X,
            "X_test": test_X,
            "y_train": train_y,
            "y_test": test_y,
        }
Exemple #15
0
def test_autoclean_with_nans_all_numerical():
    """Test autoclean() with a data set that has all numerical values and some NaNs.

    The expected frame is built by hand by filling the NaNs in 'A' and 'C'
    with the respective column median.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    for column in ('A', 'C'):
        hand_cleaned_data[column] = hand_cleaned_data[column].fillna(
            hand_cleaned_data[column].median())

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)
def test_autoclean_with_nans_all_numerical():
    """Test autoclean() with a data set that has all numerical values and some NaNs.

    The expected frame is built by hand by filling the NaNs in 'A' and 'C'
    with the respective column median.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    for column in ('A', 'C'):
        hand_cleaned_data[column] = hand_cleaned_data[column].fillna(
            hand_cleaned_data[column].median())

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)
Exemple #17
0
def test_autoclean_real_data():
    """Test autoclean() with the adult data set.

    NaNs are injected into 'age' and 'education'. The expected frame fills
    'age' with its median and 'education' with its most frequent value,
    then label-encodes the listed string columns.
    """
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    hand_cleaned_adult_data = adult_data.copy()

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    hand_cleaned_adult_data['age'] = hand_cleaned_adult_data['age'].fillna(
        hand_cleaned_adult_data['age'].median())
    hand_cleaned_adult_data['education'] = hand_cleaned_adult_data['education'].fillna(
        hand_cleaned_adult_data['education'].mode()[0])

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    cleaned_adult_data = autoclean(adult_data)

    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
def test_autoclean_real_data():
    """Test autoclean() with the adult data set.

    NaNs are injected into 'age' and 'education'. The expected frame fills
    'age' with its median and 'education' with its most frequent value,
    then label-encodes the listed string columns.
    """
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    hand_cleaned_adult_data = adult_data.copy()

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    hand_cleaned_adult_data['age'] = hand_cleaned_adult_data['age'].fillna(
        hand_cleaned_adult_data['age'].median())
    hand_cleaned_adult_data['education'] = hand_cleaned_adult_data['education'].fillna(
        hand_cleaned_adult_data['education'].mode()[0])

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    cleaned_adult_data = autoclean(adult_data)

    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
Exemple #19
0
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs.

    The expected frame is built by hand: the numeric column 'A' is filled
    with its median, the string column 'C' is filled with its most frequent
    value and then label-encoded.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    hand_cleaned_data['A'] = hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median())
    hand_cleaned_data['C'] = hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0])
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)
Exemple #20
0
def clean_data():
    """Load the table, autoclean it and return the cleaned frame.

    The "stock_code" and "industry_type" columns are removed from a deep
    copy before cleaning. Afterwards the "index" column is deleted and the
    original "stock_code" values are inserted back at column position 1.
    """
    original_df = pandas_read_data_from_table()

    working_df = copy.deepcopy(original_df)
    for dropped in ("stock_code", "industry_type"):
        del working_df[dropped]

    clean_data_df = autoclean(working_df)
    del clean_data_df["index"]

    # Put the untouched stock codes back as the second column.
    clean_data_df.insert(1, "stock_code", original_df["stock_code"])
    # "industry_type" is not re-inserted: the original author noted that
    # inserting it here produced garbled Chinese text.
    return clean_data_df
# this is a logistic regression multi-class classifier
# it predicts which skin condition (of 6 total conditions or classes) is represented by input X

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import datacleaner
from sigmoid import sigmoid
from train_classifier import train_classifier

# Load the data set and clean it; `.values` turns the cleaned frame into an array.
# NOTE(review): the second positional argument (True) is presumably
# autoclean's drop_nans flag; confirm against the datacleaner signature.
data = pd.read_csv('dataset_multi.csv')
clean_data = datacleaner.autoclean(data, True).values
# Columns 0..33 are used as features, column 34 as the label.
X = np.matrix(clean_data[:, 0:34])
y = np.matrix(clean_data[:, 34:35])

# Number of training examples (rows of y)
m = y.shape[0]

# Prepend a column of ones to X
X0 = np.ones((X.shape[0], 1))
X = np.hstack((X0, X))

# Number of labels is taken as the largest value in y; L is the
# lambda of the regularization term
num_labels = y.max()
L = 1

# Learning parameters: step size and iteration count
alpha = 0.01
iterations = 5000
Exemple #22
0
def load_house_prices(return_X_y=False,
                      return_dataframes=False,
                      data_file='train.csv',
                      test_data_file=None,
                      do_categoricals=True,
                      do_autoclean='drop',
                      predict_colname='SalePrice',
                      do_get_dummies=False,
                      write_transformed_data=True):
    """Load and return the Kaggle Ames Iowa House Prices dataset.

    ==============     =======================
    Samples total                         1460
    Dimensionality                          81
    Features           real, positive, strings
    Targets                real 34900 - 755000
    ==============     =======================
    Parameters
    ----------
    return_X_y : boolean, default=False.
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.
    return_dataframes: boolean, default=False
        If true, returns ``(df)`` or ``(df, test_df)``
        instead of a Bunch object.
    data_file : str, default='train.csv'
        The data file to load
    test_data_file : str, default=None
        The test data file to load (e.g. test.csv)

    do_categoricals : bool, default=True
        If True, call df[col].astype('category', categories=[...], ordered=True)
        with each column (currently not implemented; see the TODO below)
    do_autoclean : bool,str, default='drop'
        Whether to datacleaner.autoclean[_cv] the dataset(s)

        - 'drop': drop the predict_colname from train_df before autoclean
        - 'append_mean': test_df[predict_colname]=train_df[predict_colname].mean()
          before autoclean
    predict_colname : str, default='SalePrice'
        The column name of the column to be predicted
    do_get_dummies : bool, default=False
        Whether to run pd.get_dummies on the dataset(s)
    write_transformed_data: bool, default=True
        If True, write transformed data in a file named with a
        'transformed.csv' suffix. Any other truthy value is used as the
        output file name for the training data.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the regression targets,
        and 'DESCR', the full description of the dataset.
    (data, target) : tuple if ``return_X_y`` is True

    When ``test_data_file`` is given, a pair is returned instead: one entry
    for the training data and one for the test data (whose target is None).

    Examples
    --------
    >>> from house_prices.data import load_house_prices
    >>> house_prices = load_house_prices()
    >>> print(house_prices.data.shape)
    (1460, 81)
    """
    module_path = dirname(__file__)

    description_filepath = 'data_description.txt'
    fdescr_name = join(module_path, 'data', description_filepath)
    with open(fdescr_name) as f:
        descr_text = f.read()
        # Rewind so the description can be parsed from the start.
        f.seek(0)
        column_categories = HousePricesSuperBunch.parse_description(f)

    data_file_name = join(module_path, 'data', data_file)
    df = pd.read_csv(data_file_name, index_col='Id')

    if test_data_file:
        test_data_file_name = join(module_path, 'data', test_data_file)
        test_df = pd.read_csv(test_data_file_name, index_col='Id')

    # TODO
    # if do_categoricals:
    #     HousePricesSuperBunch.do_categoricals(train_df=train_df, test_df=test_df)

    if do_get_dummies:

        def keys_with_values(column_categories):
            """Yield the column keys that have at least one category value."""
            for colkey in column_categories:
                values = column_categories[colkey]
                if len(values):
                    yield colkey

        categorical_columns = list(keys_with_values(column_categories))
        get_dummies_dict = {key: key for key in categorical_columns}
        df = pd.get_dummies(df,
                            prefix=get_dummies_dict,
                            columns=get_dummies_dict)
        if test_data_file:
            test_df = pd.get_dummies(test_df,
                                     prefix=get_dummies_dict,
                                     columns=get_dummies_dict)

    if do_autoclean:
        if test_data_file:
            df, test_df = do_autoclean_cv(df,
                                          test_df,
                                          do_autoclean=do_autoclean,
                                          predict_colname=predict_colname)
        else:
            df = datacleaner.autoclean(df, ignore_update_check=True)

    if write_transformed_data:
        transformed_data_filename = (data_file_name + '.transformed.csv'
                                     if write_transformed_data is True else
                                     write_transformed_data)
        df.to_csv(transformed_data_filename)
        if test_data_file:
            clean_test_data_filename = test_data_file_name + '.transformed.csv'
            test_df.to_csv(clean_test_data_filename)

    # Use predict_colname rather than a hard-coded 'SalePrice', and .values
    # rather than as_matrix(), which was removed in pandas 1.0.
    target = df[predict_colname].values
    del df[predict_colname]
    # Take the feature names after the target is removed, so they line up
    # with the data columns even when the target is not the last column
    # (e.g. after get_dummies appends new columns).
    feature_names = df.columns.tolist()
    if return_dataframes:
        if test_data_file is None:
            return df
        else:
            return df, test_df

    data = df.values
    if test_data_file is None:
        if return_X_y:
            return data, target

        return Bunch(
            data=data,
            target=target,
            feature_names=feature_names,
            DESCR=descr_text)
    elif test_data_file:
        if return_X_y:
            return (data, target), (test_df.values, None)
        return (Bunch(data=data,
                      target=target,
                      feature_names=feature_names,
                      DESCR=descr_text),
                Bunch(data=test_df.values,
                      target=None,
                      feature_names=test_df.columns.tolist(),
                      DESCR=descr_text))
Exemple #23
0
import pandas as pd
from datacleaner import autoclean
import time
from autofeature.sample_reduction import sample_reduction
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from autofeature.operator_config import config1, config2
from autofeature.autofeature import AutoFeature
from xgboost.sklearn import XGBClassifier

# Input locations and the name of the label column.
train_file = "/home/learner/project/dataset/titanic/train_tita.csv"
test_file = "/home/learner/project/dataset/titanic/test_tita.csv"
target = "Survived"

# Load the training data and autoclean it.
df = pd.read_csv(train_file)
df = autoclean(df)

# Record a start timestamp, then run sample_reduction on the cleaned
# training frame and save the first returned frame.
t1 = time.time()
least_df, other_df, least_index = sample_reduction(df, target, 0.9)
least_df.to_csv("/tmp/least.csv", index=False)

# Load the test data and autoclean it separately from the training data.
# NOTE(review): error_bad_lines is deprecated in recent pandas releases;
# confirm the pandas version this script targets.
test_data = pd.read_csv(test_file, error_bad_lines=False)
test_df = autoclean(test_data)

af = AutoFeature(df,
                 "Survived",
                 20,
                 XGBClassifier,
                 roc_auc_score,
                 "classification",
                 least_index,
Exemple #24
0
# Load the dataframes
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Concatenate the data frames so that autoclean works properly.
#
# Since we have categorical variables, the encoder has to label them
# consistently, so it must see the full dataset (train + test) to avoid
# representation errors.
#
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# sort=False keeps the column order that append produced.
data = pd.concat([train_df, test_df], sort=False)
data = autoclean(data)
# Split back by position: the first len(train_df) rows are the training rows.
train, test = data[0:len(train_df)], data[len(train_df):]

# Organize our data for training
X = train.drop(["y"], axis=1)
Y = train["y"]
x_test = test.drop(["y"], axis=1)
X, X_Val, Y, Y_Val = train_test_split(X, Y)

# A parameter grid for XGBoost
params = {
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
    'max_depth': [2, 3, 4]
def clean_data(df):
    """Return the result of ``autoclean`` applied to a copy of ``df``."""
    return autoclean(df.copy())
Exemple #26
0
def cleaner(data):
    """Return ``data`` after passing it through ``autoclean``."""
    return autoclean(data)
Exemple #27
0
 def execute(self, input_message, params, **kwargs):
     """Build a DataFrame from ``input_message`` and return it autocleaned."""
     return autoclean(pd.DataFrame(input_message))
Exemple #28
0
from datacleaner import autoclean

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor

# =============================================================================
#
# =============================================================================
# Load the raw train/test data.
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

# Autoclean each data set on its own.
train = autoclean(train_data)
test = autoclean(test_data)

# =============================================================================
#
# =============================================================================
# train_test_split returns the splits in argument order, so the features
# must be passed first and the target second to match the
# (X_train, X_test, y_train, y_test) unpacking. The original call had the
# two swapped, which put SalePrice into X_* and the features into y_*.
X_train, X_test, y_train, y_test = train_test_split(train.drop('SalePrice',
                                                               axis=1),
                                                    train.SalePrice,
                                                    train_size=0.75,
                                                    test_size=0.25)

tpot = TPOTRegressor(generations=5,
                     population_size=20,
                     verbosity=2,
                     config_dict='TPOT light')
Exemple #29
0
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rfc
from tpot import TPOTClassifier
from datacleaner import autoclean

df = pd.read_csv('file.csv')
rfc = rfc()
clf = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# Look up the rainfall for each row's month (months are 1-based).
# NOTE(review): `rainfall` is not defined in the visible code; it must be
# provided before this line runs.
rain = [rainfall[month - 1] for month in df['Month']]
df['rainfall'] = rain
df = autoclean(df)
# The columns to remove go in one list; passing them as separate positional
# arguments would be read as drop()'s `axis` and `index` parameters.
X = df.drop(['Disease', 'Larva', 'Diagno'], axis=1)
y = df['#Target label']

# NOTE(review): train_test_split and joblib are not imported in the visible
# code; confirm they are in scope.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
clf.fit(X_train, y_train)
joblib.dump(clf.fitted_pipeline_, 'model.pkl')
    data = pd.read_csv("responses.csv", sep=',')
    data = shuffle(data)
    data_X = data.drop('Empathy', axis=1)
    data_Y = data['Empathy']

    #Drops rows with missing target values
    for i, nullbool in enumerate(data_Y.isnull()):
        if nullbool == True:
            data_Y = data_Y.drop(data.index[i])
            data_X = data_X.drop(data.index[i])
    data_Y = data_Y.reset_index(drop=True)
    data_X = data_X.reset_index(drop=True)
    data = pd.concat([data_X, data_Y], axis=1)
    #Autoclean
    autoclean(data, drop_nans=False, copy=False, ignore_update_check=False)

    ##Split to test set

    train_data = data[:-150]
    test_data = data[-150:]
    test_data_X = test_data.drop('Empathy', axis=1)
    test_data_Y = test_data['Empathy']
    data = train_data
    data_Y = data['Empathy']
    data_X = data.drop('Empathy', axis=1)

    # One hot encoding
    # data_X_O = data_X.drop('Height', axis=1)
    # data_X_O = data_X_O.drop('Weight', axis=1)
    # data_X_O = data_X_O.drop('Age', axis=1)
Exemple #31
0
    def restore_ind(self, ind, df):
        """Compile ``ind`` with ``gp.compile`` and evaluate it on ``df``'s columns.

        Each column of ``df`` is passed to the compiled callable as a keyword
        argument named after the column, holding that column's ``.values``.
        Both ``ind`` and the compiled callable are printed along the way.

        Returns whatever the compiled callable returns.
        """
        print(str(ind))
        compiled = gp.compile(ind, self.pset)
        print(str(compiled))
        columns_as_arrays = {name: df[name].values for name in df.columns}
        return compiled(**columns_as_arrays)

        


if __name__ == "__main__":
    # Demo run: clean the training CSV and search for new features.
    import pandas as pd
    from operator_config import config1, config2, config3
    from datacleaner import autoclean

    raw_data = pd.read_csv("/tmp/train.csv", error_bad_lines=False)
    clean_data = autoclean(raw_data)

    ag = AutoGenerator(clean_data, "Survived", config1)
    new_add_cols, transform_methods = ag.run(
        popsize=100,
        matepb=0.7,
        mutpb=0.2,
        gensize=10,
        selectsize=100,
        kbest=30,
    )





Exemple #32
0

#y_pred = y_pred.astype('float64')
#y_test = y_test.values.astype('float64')

# Score the predictions with the rmsle_metric helper (defined elsewhere).
rmsle = rmsle_metric(y_pred, y_test)
#msle = mean_squared_log_error(y_test, y_pred)

# Bare expression: displays the value when run as a notebook cell.
rmsle

# Applying SelectKBest and PCA separately gave the best performance in the
# linear-regression test, with an RMSLE of 1.94 for both.
#
# This line of experiments will be abandoned.

# In[6]:

# Separate the features from the 'target' column.
X = train.drop('target', axis=1)
y = train.target

# In[ ]:

from tpot import TPOTRegressor
from datacleaner import autoclean

# Fit a TPOT regressor on the autocleaned features.
model = TPOTRegressor(generations=2,
                      population_size=50,
                      scoring='mean_squared_error',
                      n_jobs=4,
                      verbosity=2)
model.fit(autoclean(X), y)
# path = 'traindata_cleaned_1w.csv'
# reader = pd.read_csv(path)


# NOTE(review): `path` is only assigned in the commented-out line above;
# confirm it is defined before this point.
_reader = pd.read_csv(path, iterator = True)
# Read the first rows to inspect the data types and column labels
chunkSize = 47000
# Formatted as one string so the output is the same on Python 2 and 3.
print('preproc %s' % chunkSize)
reader = _reader.get_chunk(chunkSize)
# print chunk

#test chunk = 1000,all = 10000
# read csv

# chunkSize = 1000

import time
# clock is more precise than time
# NOTE(review): time.clock() was removed in Python 3.8; switch to
# time.perf_counter() if this script is moved to Python 3.
start = time.clock()
# bda_pre(chunk)
# Drop duplicate rows; all columns are compared by default.
# drop_duplicates() returns a new frame unless inplace=True, so the
# original bare call discarded its result and removed nothing.
reader.drop_duplicates(inplace=True)
# Convert non-numeric strings to numbers
clean_data = autoclean(reader)
# #hash
# hash_bda(reader)
# sort 1->n
sort_bda(reader)
reader.to_csv('traindata_cleaned.csv',index = False)
end = time.clock()
print(end)
from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from time import time
import matplotlib.pyplot as plt
from operator import itemgetter
from datacleaner import autoclean
from datacleaner import autoclean_cv

# Load training and test data into pandas dataframes
# Directory holding the raw household train/test CSV files.
root = 'data/raw/A/'
train = pd.read_csv(root+ 'A_hhold_train.csv')
test = pd.read_csv(root + 'A_hhold_test.csv')


# Autoclean each data set on its own.
clean_train = autoclean(train)
clean_test = autoclean(test)

# Write the cleaned frames next to the raw files.
clean_train.to_csv(root+'A_hhold_clean_train.csv')
clean_test.to_csv(root+'A_hhold_clean_test.csv')



'''
# merge training and test sets into one dataframe
full = pd.concat([train, test])


#return a formatted percentage from a fraction
def percentage(numerator, denomenator):
    if type(numerator) == pd.core.series.Series: