                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    #Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
            else:
                logger.info('{} is None, skipping dump...'.format(name))

    #Impute missing values (mean is the only strategy for now)
    # Note that the features can specify imputation strategies;
    # and if they don't, then they already got a default imputer,
    # which imputes the median (for floats), or 0 (for integers)
    logger.info('Imputing values on train and test...')
    imputer = preprocessing.Imputer().fit(train.x)
    train.x = imputer.transform(train.x)
    test.x = imputer.transform(test.x)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
        test.x.shape))
    if args.predicttop:
        preds.x = imputer.transform(preds.x)
        logger.debug('Prediction x shape: {}'.format(preds.x.shape))
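
    # Side note (hedged): preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
    # in 0.22; sklearn.impute.SimpleImputer is the replacement. A small self-contained sketch of
    # the same mean-imputation step on toy data (not the train/test objects above):
    import numpy as np
    from sklearn.impute import SimpleImputer
    _demo = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
    _demo_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    print(_demo_imputer.fit_transform(_demo))  # the NaN in column 0 becomes the column mean, 4.0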

    # Scale features to zero mean and unit variance
    logger.info('Scaling train, test...')
    scaler = preprocessing.StandardScaler().fit(train.x)
    train.x = scaler.transform(train.x)
    test.x = scaler.transform(test.x)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
        test.x.shape))
    indices = np.argsort(classifier.feature_importances_)[::-1][:40]
    g = sns.barplot(y=X_train.columns[indices][:40],
                    x=classifier.feature_importances_[indices][:40],
                    orient='h')
    g.set_xlabel("Relative importance", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title("DT feature importances")


titanic_train = pd.read_csv("C:/Users/Algorithmica/Downloads/all/train.csv")
print(titanic_train.shape)
print(titanic_train.info())

imputable_cont_features = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(
    titanic_train[imputable_cont_features])

#impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])
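
# Side note (hedged): CategoricalImputer here most likely comes from the sklearn-pandas package.
# With newer scikit-learn (>= 0.20), SimpleImputer(strategy='most_frequent') handles string
# columns directly; a small self-contained sketch on a demo frame (not the Titanic data):
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

_embarked_demo = pd.DataFrame({'Embarked': ['S', 'C', np.nan, 'S', 'Q']})
print(SimpleImputer(strategy='most_frequent').fit_transform(_embarked_demo))  # NaN becomes 'S'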


#create categorical age column from age
def convert_age(age):
    if (age >= 0 and age <= 18):
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')
titanic_train.info()
titanic_test['Survived'] = None
titanic_test.info()
titanic_test.shape[0]

titanicAll = pd.concat([titanic_train, titanic_test])
titanicAll.info()

#EDA
titanicAll.shape
titanicAll.info()

#create an instance of Imputer class with required arguments
mean_imputer = preprocessing.Imputer()
#compute mean of age and fare respectively
mean_imputer.fit(titanic_train[['Age', 'Fare']])
#fill up the missing data with the computed means
titanicAll[['Age',
            'Fare']] = mean_imputer.transform(titanicAll[['Age', 'Fare']])


#Features considered till now: Age, Fare, Survived
#Feature creation: derive a new categorical feature from the Age column for visualization, to look for differences between age groups.
def ageRange(age):
    ageRange = ''
    if age < 16:
        ageRange = 'Child'
    elif age <= 30:
        ageRange = 'Young'
Example #4
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.preprocessing as prepross
import sklearn.model_selection as split

dataset = pd.read_csv(
    r'C:\DOC\Workspace\Machine Learning A-Z Template Folder\Part 1 - Data Preprocessing\Data.csv'
)

x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Handle missing data
imputer = prepross.Imputer()
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# Encoding categorical data
label_encoder = prepross.LabelEncoder()
x[:, 0] = label_encoder.fit_transform(x[:, 0])
y = label_encoder.fit_transform(y)

# One hot
one_hot_encoder_x = prepross.OneHotEncoder(categorical_features=[0])
x = one_hot_encoder_x.fit_transform(x).toarray()

# Remove first dummy column (avoid the dummy variable trap)
x = np.delete(x, 0, axis=1)
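
# Side note (hedged): OneHotEncoder's categorical_features argument was removed in newer
# scikit-learn; ColumnTransformer selects the column instead, and OneHotEncoder(drop='first')
# drops one dummy level, covering the manual np.delete above. A self-contained sketch on a
# demo frame (not Data.csv), assuming scikit-learn >= 0.21:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

_x_demo = pd.DataFrame({'Country': ['France', 'Spain', 'Germany'], 'Age': [44.0, 27.0, 30.0]})
_ct = ColumnTransformer([('onehot', OneHotEncoder(drop='first'), ['Country'])],
                        remainder='passthrough')  # keep the numeric column unchanged
print(_ct.fit_transform(_x_demo))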

# Split test - train
Example #5
# In[7]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=random_seed)

# ### normalize train data
#
# fill the NAs with the median, then standardize the data; the output type is ndarray

# In[8]:

clean_pipeline = Pipeline([
    ('imputer', preprocessing.Imputer(missing_values='NaN',
                                      strategy="median")),
    ('std_scaler', preprocessing.StandardScaler()),
])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.transform(X_test)  # transform only: reuse the statistics fitted on the training split
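
# Side note (hedged): with newer scikit-learn the same cleaning pipeline is written with
# sklearn.impute.SimpleImputer, and the test split is only transformed so the statistics
# fitted on the training split are reused. A self-contained sketch on toy data:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

_train_demo = np.array([[1.0, np.nan], [2.0, 10.0], [3.0, 14.0]])
_test_demo = np.array([[np.nan, 12.0]])
_clean_demo = Pipeline([('imputer', SimpleImputer(strategy='median')),
                        ('std_scaler', StandardScaler())])
print(_clean_demo.fit_transform(_train_demo))  # fit on train only
print(_clean_demo.transform(_test_demo))       # reuse the train statistics on test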

# ## TEST CE

# In[9]:

X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.25,
                                                      stratify=y_train,
                                                      random_state=random_seed)
Example #6
import os

import pandas as pd
import seaborn as sns
from sklearn import preprocessing

os.chdir(r'D:\Projects\datasets')

#read and explore data
titanic_train = pd.read_csv('titanic_train.csv')
titanic_train.shape
titanic_train.info()

#create title column from name
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()
titanic_train['Title'] = titanic_train['Name'].map(extract_title)
sns.factorplot(x="Title", hue="Survived", data=titanic_train, kind="count", size=6)

age_imputer = preprocessing.Imputer()
age_imputer.fit(titanic_train[['Age']])
titanic_train[['Age']] = age_imputer.transform(titanic_train[['Age']])

#create categorical age column from age
def convert_age(age):
    if(age >= 0 and age <= 10): 
        return 'Child'
    elif(age <= 25): 
        return 'Young'
    elif(age <= 50): 
        return 'Middle'
    else: 
        return 'Old'
titanic_train['Age1'] = titanic_train['Age'].map(convert_age)
sns.factorplot(x="Age1", hue="Survived", data=titanic_train, kind="count", size=6)
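
# Side note (hedged): sns.factorplot and its size= argument were renamed in newer seaborn
# (factorplot -> catplot, size -> height). A self-contained sketch on a tiny demo frame:
import pandas as pd
import seaborn as sns

_age_demo = pd.DataFrame({'Age1': ['Child', 'Young', 'Young', 'Middle'],
                          'Survived': [1, 0, 1, 0]})
sns.catplot(x='Age1', hue='Survived', data=_age_demo, kind='count', height=6)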
                     header=None)  #header none because no column names
dframe.info()
numdframe = dframe.iloc[:, 1:]
catdframe = dframe.iloc[:, 0]
catdf_encod = categorical(catdframe.values, dictnames=False, drop=True)
numArr = np.asarray(numdframe.values)
catArr = np.asarray(catdf_encod)
Output = numArr[:, 5]
Inp_num = numArr[:, 0:5]
Input = np.concatenate((Inp_num, catArr), axis=1)
Input = np.c_[catArr, Inp_num]
print(Input.shape)

####Q1 (b)########

imp = skp.Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
Input_new = imp.fit_transform(Input)

####Q1 (c)######

X_train, X_test, y_train, y_test = skms.train_test_split(Input_new,
                                                         Output,
                                                         test_size=0.25,
                                                         random_state=111)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(Input)

##### Q1 (d)######

svc_rbf = SVC(kernel='rbf',
              gamma='auto',
    # Use the impute-by-most-frequent approach to fill the missing
    # values in A1:
    # A1:   b, a
    #
    # Use the impute-by-mean approach to fill the missing
    # values in A2:
    # A2:   continuous

    crx_data = pd.read_csv("crx.data", header=None)
    # Since the Japanese Credit Data Set uses "?" to denote missing values,
    # replace it with np.nan. scikit-learn's Imputer only accepts np.nan
    # or integers, therefore convert "?" to np.nan.
    # This transformation is for A2, which uses scikit-learn's Imputer.
    # For A1, which uses imputer_by_most_frequent(), this transformation
    # is not necessary.
    crx_data.replace("?", np.nan, inplace=True)

    A1_no_missing = imputer_by_most_frequent(np.nan, crx_data.iloc[:,
                                                                   0].values)
    print(A1_no_missing)

    # Use scikit-learn's Imputer to fill missing values with the mean.
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
    imputer = preprocessing.Imputer(missing_values=np.nan,
                                    strategy="mean",
                                    axis=0)
    # Convert to a two-dimensional list, since Imputer only accepts
    # two-dimensional input.
    A2_two_d = np.array([[item] for item in crx_data.iloc[:, 1].values])
    A2_no_missing = imputer.fit_transform(A2_two_d)
    print(A2_no_missing)
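
# Side note (hedged): with newer scikit-learn the same mean imputation uses
# sklearn.impute.SimpleImputer, and a single column is reshaped to 2-D first.
# A self-contained sketch (A2_demo is illustrative, not the crx.data column):
import numpy as np
from sklearn.impute import SimpleImputer

A2_demo = np.array([30.83, np.nan, 24.5, np.nan, 20.17]).reshape(-1, 1)
print(SimpleImputer(strategy='mean').fit_transform(A2_demo))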
Example #9
chr1.read()

read_time = time.time() - start_time

hour, minute, second = pr.time_process(read_time)

print('\n')
print('Loading time: ' + str(hour) + "h " + str(minute) + "m " +
      str(second) + "s ")

start_time = time.time()
chr1.data_extract(strand_binary=True, pos_normalize=True)

from sklearn import preprocessing

imputer = preprocessing.Imputer(copy=False)
imputer.fit_transform(chr1.train_beta)

process_time = time.time() - start_time
hour, minute, second = pr.time_process(process_time)
print('\n')
print('Processing time: ' + str(hour) + "h " + str(minute) + "m " +
      str(second) + "s ")

train_beta_mean = np.mean(chr1.train_beta, axis=1)

predict = train_beta_mean[chr1.sample_nan]

start_time = time.time()
# Normalized square error for prediction
test_not_nan = []
Example #10
def main():
    csv_file_object = csv.reader(open('Data/train.csv',
                                      'rb'))  #Load in the training csv file
    header = next(csv_file_object)  #Skip the first line as it is a header
    train_data = []  #Create a variable called 'train_data'
    for row in csv_file_object:  #Step through each row in the csv file
        train_data.append(row[1:])  #adding each row to the data variable
    train_data = np.array(train_data)  #Then convert from a list to an array

    #I need to convert all strings to integer classifiers:
    #Male = -1, female = 1:
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    #embark C = -1, S = 0, Q = 1
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    #Survived
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    #I need to fill in the gaps of the data and make it complete.
    #So where there is no price, I will assume price on median of that class
    #Where there is no age I will give median of all ages

    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    #All the ages with no data make the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #                                          != '',4].astype(np.float))
    #All missing embarks just make them embark from most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #                                                   != '',10].astype(np.float)))

    train_data = np.delete(train_data, [2, 7, 9, 10],
                           1)  #remove the name data, cabin and ticket
    train_data[train_data == ''] = '0'
    imp.fit_transform(train_data)
    #I need to do the same with the test data now so that the columns are in the same
    #order as the training data

    #We finally split the data between the train set and the validation set
    x_train, x_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.2,
                                                        random_state=0)

    #Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    test_file_object = csv.reader(open('Data/test.csv',
                                       'rb'))  #Load in the test csv file
    header = next(test_file_object)  #Skip the first line as it is a header
    test_data = []  #Create a variable called 'test_data'
    ids = []
    for row in test_file_object:  #Step through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:])  #adding each row to the data variable
    test_data = np.array(test_data)  #Then convert from a list to an array

    #I need to convert all strings to integer classifiers:
    #Male = 1, female = -1:
    test_data[test_data[0::, 2] == 'male', 2] = 1
    test_data[test_data[0::, 2] == 'female', 2] = -1
    #embark C = -1, S = 0, Q = 1
    test_data[
        test_data[0::, 9] == 'C',
        9] = -1  #Note this is not ideal: an integer encoding implies an ordering (e.g. that 3 is three times 'better' than 1) that does not really exist
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    #All the ages with no data make the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #                                           != '',3].astype(np.float))
    #All missing embarks just make them embark from most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #                                                   != '',9].astype(np.float)))
    #All the missing prices assume median of their respective class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #            ,7].astype(np.float))

    test_data = np.delete(test_data, [1, 6, 8, 9],
                          1)  #remove the name data, cabin and ticket
    test_data[test_data == ''] = '0'
    #Impute missing values
    imp.fit_transform(test_data)

    #Standardize
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std = scaler_test.transform(test_data)
    #The data is now ready to go. So lets train then test!

    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)
    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)
    print('Predicting')
    output = random_search.predict(test_data_std)

    open_file_object = csv.writer(open("pipelinearsvcknn.csv", "wb"))
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))
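
# Side note (hedged): grid_scores_ was removed from newer scikit-learn; the per-candidate
# results now live in cv_results_ and are easy to inspect as a DataFrame. A self-contained
# sketch on synthetic data (the report() helper above is user-defined and not reproduced here):
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

_X_demo, _y_demo = make_classification(n_samples=200, n_features=5, random_state=0)
_search = RandomizedSearchCV(KNeighborsClassifier(),
                             param_distributions={'n_neighbors': sp_randint(1, 20)},
                             n_iter=5, cv=3, random_state=0)
_search.fit(_X_demo, _y_demo)
print(pd.DataFrame(_search.cv_results_)[['params', 'mean_test_score']])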
Example #11
# In[8]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                       stratify = y, random_state = random_seed)
print("80%% train: %d/%d, 20%% test: %d/%d" %(X_train.shape[0], X.shape[0], X_test.shape[0], X.shape[0]))


# ### normalize train data
#
# fill the NAs with the median, then standardize the data; the output type is ndarray

# In[9]:


clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.transform(X_test)  # transform only: reuse the statistics fitted on the training split


# # model selection

# CE without cross validation

# In[10]:


X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train, test_size=0.25, 
                                       stratify = y_train, random_state = random_seed)
Example #12
#Concatenate train and test so that both share the same columns, which makes the later preprocessing easier
titanic = pd.concat([titanic_train, titanic_test])
titanic.shape
titanic.info()


#Extract and create title column from name
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()


#The map(aFunction, aSequence) function applies a passed-in function to each item in an iterable object
#and returns a list containing all the function call results.
titanic['Title'] = titanic['Name'].map(extract_title)

mean_imputer = preprocessing.Imputer(
)  #The default strategy is mean, so the defaults are used here.
mean_imputer.fit(titanic_train[['Age', 'Fare']])
#Age is missing in both train and test data.
#Fare is NOT missing in train data but is missing in test data. Since we are working on the combined titanic data, we apply the mean imputer on Fare as well.
titanic[['Age', 'Fare']] = mean_imputer.transform(titanic[['Age', 'Fare']])


#create categorical age column from age
#It's always good practice to create functions so that the same transformation can be applied to the test data as well
def convert_age(age):
    if (age >= 0 and age <= 10):
        return 'Child'
    elif (age <= 25):
        return 'Young'
    elif (age <= 50):
        return 'Middle'
Example #13
test_data_no = test_data_S[:, 0]
# test_data_S = test_data_S[:, 1:test_data_S.shape[1]]
test_data_S = append_feature(test_df, istest=True)

print('data split end.', trans_S.shape, trans_T.shape, label_S.shape,
      label_T.shape, test_data_S.shape)

# # Adding features for the sum, variance and number of missing values improves the results somewhat
# trans_T = append_feature(trans_T, train_df)
# trans_S = append_feature(trans_S, train_df1)
# test_data_S = append_feature(test_data_S, test_df)
#
# print 'append feature end.', trans_S.shape, trans_T.shape, label_S.shape, label_T.shape, test_data_S.shape

imputer_T = preprocessing.Imputer(missing_values='NaN',
                                  strategy='most_frequent',
                                  axis=0)
imputer_S = preprocessing.Imputer(missing_values='NaN',
                                  strategy='most_frequent',
                                  axis=0)
# imputer_T.fit(trans_T,label_T)
imputer_S.fit(trans_S, label_S)

trans_T = imputer_S.transform(trans_T)
trans_S = imputer_S.transform(trans_S)

test_data_S = imputer_S.transform(test_data_S)

# pca_T = decomposition.PCA(n_components=50)
# pca_S = decomposition.PCA(n_components=50)
#
Example #14
from sklearn import preprocessing
import numpy as np

# Handling missing values
# Collected data can contain many missing values.
#   Let's look at how to deal with them.
#       Use preprocessing's Imputer to set missing values to a desired value

x = [[1, 2], [np.nan, 3], [7, 6], [7, 2], [2, 3], [3, 4]]

help(preprocessing.Imputer)

#strategy ==> mean, median, most_frequent
imp = preprocessing.Imputer(missing_values="NaN", strategy="median")
x2 = imp.fit(x).transform(x)
print(x2)
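
# Side note (hedged): in newer scikit-learn the same median imputation is written with
# sklearn.impute.SimpleImputer; a self-contained equivalent of the example above:
import numpy as np
from sklearn.impute import SimpleImputer

x_demo = [[1, 2], [np.nan, 3], [7, 6], [7, 2], [2, 3], [3, 4]]
print(SimpleImputer(strategy="median").fit_transform(x_demo))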
Example #15
# Data Preprocessing

# Importing libs
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.preprocessing as sklp
import sklearn.model_selection as sklcv

# Read in data
dataset = pd.read_csv("Data.csv")
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

# Fix missing data issues
imputer = sklp.Imputer(missing_values='NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

# Deal with categorical data
lbEncIv = sklp.LabelEncoder()
X[:,0] = lbEncIv.fit_transform(X[:,0])
oneHotEncIv = sklp.OneHotEncoder(categorical_features = [0])
X = oneHotEncIv.fit_transform(X).toarray()

lbEncDv = sklp.LabelEncoder()
y = lbEncIv.fit_transform(y)

# Split dataset into training and test datasets to validate ML model
XTrain,XTest,yTrain,yTest = sklcv.train_test_split(X, y, test_size = 0.2, random_state = 0)
            'clarity':[2,('I1','IF','SI1','SI2','VS1','VS2','VVS1','VVS2')],\
            'depth':[0,(40,80)],\
            'table':[0,(40,100)],\
            'x':[0,(0,11)],\
            'y':[0,(0,60)],\
            'z':[0,(0,32)],\
            'price':[0,(300,20000)]\
            }
rie = ReplaceImputeEncode(data_map=data_map, display=True)
df.rie = rie.fit_transform(df)

#Imputing Missing Values
from sklearn import preprocessing
interval_attributes = ['Carat', 'depth', 'table', 'x', 'y', 'z']
interval_data = df.as_matrix(columns=interval_attributes)
interval_imputer = preprocessing.Imputer(strategy='mean')
imputed_interval_data = interval_imputer.fit_transform(interval_data)

print("Imputed Interval Data:\n", imputed_interval_data)

# Convert String Categorical Attribute to Numbers for further assessment
# Mapping of categories to numbers for attribute 'cut'
cut_map = {'Ideal': 0, 'Premium': 1, 'Good': 2, 'Very Good': 3, 'Fair': 4}
df['cut'] = df['cut'].map(cut_map)
# Mapping of categories to numbers for attribute 'color'
color_map = {'E': 0, 'I': 1, 'J': 2, 'H': 3, 'F': 4, 'G': 5, 'D': 6}
df['color'] = df['color'].map(color_map)
# Mapping of categories to numbers for attribute 'clarity'
clarity_map = {
    'SI2': 0,
    'SI': 1,
Example #17
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))

############################################################################
# We can now also inspect the flow object which was automatically created:

flow = openml.flows.get_flow(run.flow_id)
pprint(vars(flow), depth=1)

############################################################################
# It also works with pipelines
# ############################
#
# When you need to handle 'dirty' data, build pipelines to model them automatically.
task = openml.tasks.get_task(115)
pipe = pipeline.Pipeline(
    steps=[('Imputer', preprocessing.Imputer(strategy='median')),
           ('OneHotEncoder',
            preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')
            ), ('Classifier', ensemble.RandomForestClassifier())])
flow = openml.flows.sklearn_to_flow(pipe)

run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False)
myrun = run.publish()
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))

############################################################################
# Challenge
# ^^^^^^^^^
#
# Try to build the best possible models on several OpenML tasks,
# compare your results with the rest of the class and learn from
Example #18
def fill_null_columns(request):
    if request.method == 'POST':
        post_data = json.loads(request.POST['data'])
        columns_to_fill = post_data['cols_to_fill']
        strategy = post_data["strategy"]

        try:
            enc = preprocessing.LabelEncoder()
            with tempfile.NamedTemporaryFile(suffix=".csv",
                                             delete=False) as tp:
                csv = request.user.csvfile.read().strip().split("\n")
                header = csv[0].split(",")

                tp.write(','.join(header) + '\n')

                cols_indices = []
                csv.pop(0)

                search_terms = []
                for search_term, cols in columns_to_fill.items():
                    for col in cols:
                        cols_indices.append(header.index(col))
                    if search_term == "empty":
                        search_term = ""
                    search_terms.append(search_term)

                lines_list = []
                null_list = []
                for line in csv:
                    line_list = line.split(",")
                    lines_list.append(line_list)

                    null_list_temp = []
                    for ci in cols_indices:
                        if line_list[ci] in search_terms:
                            null_list_temp.append(np.nan)
                        else:
                            null_list_temp.append(line_list[ci])
                    null_list.append(null_list_temp)

                enc = preprocessing.Imputer(missing_values='NaN',
                                            strategy=strategy)
                imputed_list = list(enc.fit_transform(null_list))

                for i, value in enumerate(imputed_list):
                    for j, c_value in enumerate(value):
                        lines_list[i][cols_indices[j]] = str(c_value)

                for i, line in enumerate(lines_list):
                    tp.write(','.join(line) + '\n')

                tp.flush()
                tp.seek(0)
                request.user.csvfile.delete()
                request.user.csvfile.save('csvfile.csv',
                                          ContentFile(tp.read()))

        except Exception as e:
            print(e)
            return JsonResponse({"status": "failed", "message": str(e)})

        return JsonResponse({"status": 'success'})
Example #19
@author: arellave
"""

#use of inbuilt pipeline.

from sklearn import datasets
import numpy as np
# generate random symmetric psd matrix
mat = datasets.make_spd_matrix(10)
masking_array = np.random.binomial(1, 0.1, mat.shape).astype(bool)
mat[masking_array] = np.nan
print(mat[:4, :4])

#without pipeline
from sklearn import preprocessing
impute = preprocessing.Imputer()
scaler = preprocessing.StandardScaler()
mat_imputed = impute.fit_transform(mat)
print("Imputing :")
print(mat_imputed[:4, :4])

mat_imp_and_scaled = scaler.fit_transform(mat_imputed)
print("Scaling :")
print(mat_imp_and_scaled[:4, :4])

#using pipeline
from sklearn import pipeline
pipe = pipeline.Pipeline([('impute', impute), ('scaler', scaler)])

print(pipe)
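
# A minimal continuation sketch (hedged): the pipeline above can be fit and applied in one
# call, chaining the imputer and the scaler on the masked matrix built earlier in this example:
new_mat = pipe.fit_transform(mat)
print("Pipeline output:")
print(new_mat[:4, :4])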
def main():
    # Runtime
    start_time = time.time()

    # read in dataset
    gtd = pd.read_csv("GTD/globalterrorismdb (cleaned).csv", delimiter=",")

    # remove features
    gtd = gtd.drop(['country_txt'], axis=1)
    gtd = gtd.drop(['region_txt'], axis=1)
    gtd = gtd.drop(['attacktype_txt'], axis=1)
    gtd = gtd.drop(['targtype_txt'], axis=1)
    gtd = gtd.drop(['targsubtype_txt'], axis=1)
    gtd = gtd.drop(['weaptype_txt'], axis=1)
    gtd = gtd.drop(['area'], axis=1)
    gtd = gtd.drop(['city'], axis=1)
    gtd = gtd.drop(['property'], axis=1)
    gtd = gtd.drop(['propextent'], axis=1)
    gtd = gtd.drop(['propextent_txt'], axis=1)

    # REMOVED USED TO DROP TO 2 FEATURES
    # ---------------------------------------------------------------
    # Dropped after feature selection
    # gtd = gtd.drop(['nwound'], axis=1)
    # gtd = gtd.drop(['ishostkid'], axis=1)
    # gtd = gtd.drop(['attacktype'], axis=1)
    # gtd = gtd.drop(['nkill'], axis=1)
    # gtd = gtd.drop(['targtype'], axis=1)
    # gtd = gtd.drop(['targsubtype'], axis=1)
    # gtd = gtd.drop(['weaptype'], axis=1)
    # gtd = gtd.drop(['year'], axis=1)
    # gtd = gtd.drop(['success'], axis=1)
    # ---------------------------------------------------------------

    # REMOVED USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # gtd = gtd.drop(['country'], axis=1)
    # gtd = gtd.drop(['region'], axis=1)
    # gtd = gtd.drop(['attacktype'], axis=1)
    # gtd = gtd.drop(['targtype'], axis=1)
    # gtd = gtd.drop(['targsubtype'], axis=1)
    # gtd = gtd.drop(['weaptype'], axis=1)
    # ---------------------------------------------------------------

    # top organisations
    print("\n\nTop organisations")
    print(pd.value_counts(gtd['gname']))

    # new dataframe with only selected organisations
    test1 = gtd[gtd.gname == 'Taliban']
    test2 = gtd[gtd.gname == 'Shining Path (SL)']
    test3 = gtd[gtd.gname ==
                'Farabundo Marti National Liberation Front (FMLN)']
    test4 = gtd[gtd.gname == 'Islamic State of Iraq and the Levant (ISIL)']
    test5 = gtd[gtd.gname == 'Irish Republican Army (IRA)']
    test6 = gtd[gtd.gname == 'Revolutionary Armed Forces of Colombia (FARC)']
    test7 = gtd[gtd.gname == 'New People\'s Army (NPA)']
    test8 = gtd[gtd.gname == 'Al-Shabaab']
    test9 = gtd[gtd.gname == 'Basque Fatherland and Freedom (ETA)']
    test10 = gtd[gtd.gname == 'Boko Haram']
    test11 = gtd[gtd.gname == 'Kurdistan Workers\' Party (PKK)']
    test12 = gtd[gtd.gname == 'Communist Party of India - Maoist (CPI-Maoist)']
    test13 = gtd[gtd.gname == 'Liberation Tigers of Tamil Eelam (LTTE)']
    test14 = gtd[gtd.gname == 'National Liberation Army of Colombia (ELN)']
    test15 = gtd[gtd.gname == 'Tehrik-i-Taliban Pakistan (TTP)']
    test16 = gtd[gtd.gname == 'Maoists']
    test17 = gtd[gtd.gname == 'Palestinians']
    test18 = gtd[gtd.gname == 'Nicaraguan Democratic Force (FDN)']
    test19 = gtd[gtd.gname == 'Al-Qaida in the Arabian Peninsula (AQAP)']
    test20 = gtd[gtd.gname == 'Manuel Rodriguez Patriotic Front (FPMR)']
    frames = [
        test1, test2, test3, test4, test5, test6, test7, test8, test9, test10,
        test11, test12, test13, test14, test15, test16, test17, test18, test19,
        test20
    ]
    result = pd.concat(frames)

    # determine number of missing values in each column
    print("\n\nCheck missing values")
    print(result.isnull().sum())

    # REMOVE TO RUN WITH 2 FEATURES
    # ---------------------------------------------------------------
    #impute the mean value for all missing values in the nkill, nwound, targsubtype and ishostkid columns
    imputer = preprocessing.Imputer(missing_values='NaN',
                                    strategy='mean',
                                    axis=1)

    imputer.fit(result["nkill"])
    newValues1 = imputer.transform(result["nkill"])
    result["nkill"] = newValues1[0]

    imputer.fit(result["targsubtype"])
    newValues2 = imputer.transform(result["targsubtype"])
    result["targsubtype"] = newValues2[0]

    imputer.fit(result["nwound"])
    newValues3 = imputer.transform(result["nwound"])
    result["nwound"] = newValues3[0]

    imputer.fit(result["ishostkid"])
    newValues4 = imputer.transform(result["ishostkid"])
    result["ishostkid"] = newValues4[0]
    # ---------------------------------------------------------------
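
    # Side note (hedged): the Imputer(axis=1) calls above impute each Series by treating it as a
    # single row; with pandas alone the same mean fill is a one-liner. A tiny self-contained
    # sketch (demo_series is illustrative, not a GTD column):
    demo_series = pd.Series([1.0, float('nan'), 3.0])
    print(demo_series.fillna(demo_series.mean()))  # the NaN becomes 2.0, the mean of the others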

    print("\n\nCheck if missing values were removed")
    print(result.isnull().sum())

    # encode categorical variables as continuous variables
    result['organisation'] = result['gname'].map({
        'Taliban':
        0,
        'Shining Path (SL)':
        1,
        'Farabundo Marti National Liberation Front (FMLN)':
        2,
        'Islamic State of Iraq and the Levant (ISIL)':
        3,
        'Irish Republican Army (IRA)':
        4,
        'Revolutionary Armed Forces of Colombia (FARC)':
        5,
        'New People\'s Army (NPA)':
        6,
        'Al-Shabaab':
        7,
        'Basque Fatherland and Freedom (ETA)':
        8,
        'Boko Haram':
        9,
        'Kurdistan Workers\' Party (PKK)':
        10,
        'Communist Party of India - Maoist (CPI-Maoist)':
        11,
        'Liberation Tigers of Tamil Eelam (LTTE)':
        12,
        'National Liberation Army of Colombia (ELN)':
        13,
        'Tehrik-i-Taliban Pakistan (TTP)':
        14,
        'Maoists':
        15,
        'Palestinians':
        16,
        'Nicaraguan Democratic Force (FDN)':
        17,
        'Al-Qaida in the Arabian Peninsula (AQAP)':
        18,
        'Manuel Rodriguez Patriotic Front (FPMR)':
        19
    }).astype(int)

    result = result.drop(['gname'], axis=1)

    print("\n\nData frame information")
    print(result.info())

    # REMOVED USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # perform one-hot encoding on Embarked column
    # result = pd.get_dummies(result, columns=["country_txt"])
    # result = pd.get_dummies(result, columns=["region_txt"])
    # result = pd.get_dummies(result, columns=["area"])
    # result = pd.get_dummies(result, columns=["city"])
    # result = pd.get_dummies(result, columns=["attacktype_txt"])
    # result = pd.get_dummies(result, columns=["targtype_txt"])
    # result = pd.get_dummies(result, columns=["targsubtype_txt"])
    # result = pd.get_dummies(result, columns=["weaptype_txt"])
    # --------------------------------------------------------------

    # Next separate the class data from the training data
    target = result["organisation"]
    data = result.drop(["organisation"], axis=1)

    # REMOVED FEATURE SELECTION
    # ---------------------------------------------------------------
    # Univariate Feature Selection
    # feature_names = list(result.columns.values)
    # Selector_f = SelectPercentile(f_regression, percentile=25)
    # Selector_f.fit(data, target)
    # for n, s in zip(feature_names, Selector_f.scores_):
    #     print 'F Score', s, "for feature", n

    # Tree-based Feature Selection
    # forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # forest.fit(data, target)
    # importances = forest.feature_importances_
    # for n, s in zip(feature_names, importances):
    #     print 'F Score', s, "for feature", n
    # ---------------------------------------------------------------

    print("\n\nnumber of features")
    print(len(result.columns))
    print("number of rows")
    print(result.shape[0])

    # REMOVED USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # print "\n\nRunning classifiers before standardization"
    # runClassifiers(data, target)
    # --------------------------------------------------------------

    # Run standardization on the data
    scalingObj = preprocessing.StandardScaler()
    standardizedData = scalingObj.fit_transform(data)
    data = pd.DataFrame(standardizedData, columns=data.columns)

    # Split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        random_state=0)

    # REMOVED USED AFTER RESULT ON TOP 10 (RESULT kernel = linear, C = 1)
    # Forcing best_params_ to above as I need the estimator object
    # ---------------------------------------------------------------
    # Hyper-parameter optimization on the data.
    print("\n\nRunning hyper-parameter optimization........")
    # param_grid = [{'kernel': ['rbf', 'poly', 'linear'], 'C': range(1, 15)}]
    param_grid = [{'kernel': ['linear'], 'C': range(1, 2)}]
    clf = GridSearchCV(SVC(), param_grid, cv=10)
    clf.fit(data, target)
    print("\n\nBest parameters set found on development set:")
    print(clf.best_params_)
    # ---------------------------------------------------------------

    # Run classifier
    classifier = svm.SVC(kernel=clf.best_params_["kernel"],
                         C=clf.best_params_["C"])
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Plot non-normalized confusion matrix
    plt.figure()
    class_names = [
        'Taliban', '(SL)', '(FMLN)', '(ISIL)', '(IRA)', '(FARC)', '(NPA)',
        'Al-Shabaab', '(ETA)', 'Boko Haram', '(PKK)', '(CPI-Maoist)', '(LTTE)',
        '(ELN)', '(TTP)', 'Maoists', 'Palestinians', '(FDN)', '(AQAP)', '(FPMR)'
    ]
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Confusion matrix, without normalization')

    plt.show()

    # REMOVED USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # print "\n\nRunning classifiers after standardization"
    # run_classifiers(data, target)
    # ---------------------------------------------------------------

    scores = model_selection.cross_val_score(clf.best_estimator_,
                                             data,
                                             target,
                                             cv=10)
    print("SVM : ", scores.mean())

    # Runtime
    print("--- %s seconds ---" % (time.time() - start_time))
Example #21
    def __init__(self, number_of_features, random_seed):
        self.__number_of_features = number_of_features

        # Documentation of function:
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html#sklearn.preprocessing.Imputer
        self.__imputer = preprocessing.Imputer(
            missing_values="NaN",
            strategy="median",
            verbose=1,
            axis=0,
            # Probably not necessary, but future-proof. Setting to false may improve
            # performance and reduce memory requirements.
            copy=True)

        # We want to manually control the number of features at each split in order to tune the algorithm for Spectrum
        # data.
        features_at_each_split = int(math.sqrt(number_of_features) + 0)

        # Unfortunately, this library doesn't allow early stopping when the next split has a higher impurity
        # than the current split.
        #
        # Documentation for this function is at:
        # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.predict_log_proba
        self.__classifier = RandomForestClassifier(
            bootstrap=True,
            class_weight=None,
            criterion='gini',
            # Since early stopping options are limited, we might
            # need to tweak this number for short-term results.
            max_depth=None,
            max_features=features_at_each_split,
            # I don't think setting an arbitrary limit to leaf nodes is a good way to prevent over-
            # splitting, but this could potentially be tuned for early stopping purposes.
            max_leaf_nodes=None,
            # This is the option that lacks the early stopping method I want to use.
            # Since early stopping options are limited, this is another option that can be tuned to
            # avoid too many internal nodes.
            min_impurity_split=None,
            min_impurity_decrease=0.0,
            # 100 seems like a decent sample size, and may be a temporary solution to our early
            # stopping problem. Scratch that. Available data is much lower than previously mentioned.
            min_samples_split=10,
            min_samples_leaf=1,
            # Spectrum and recruiter data isn't weighted, so this should be zero.
            min_weight_fraction_leaf=0.0,
            # Seems like a good average number to start with according to this research paper:
            # https://www.researchgate.net/publication/230766603_How_Many_Trees_in_a_Random_Forest
            n_estimators=96,
            # Make sure this correctly detects the number of cores on the bare-metal
            # production server and runs on all them.
            n_jobs=-1,
            oob_score=False,
            random_state=random_seed,
            verbose=1,
            warm_start=False)

        # A pipeline is necessary when imputing missing values to avoid leaking statistics about the test data into
        # the model when cross-validating.
        #
        # Documentation of function:
        # http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
        pipeline = Pipeline(
            [("imputer", self.__imputer), ("forest", self.__classifier)],
            # Enabling this may improve performance by caching
            memory=None)
        self.__pipeline = pipeline
Example #22
# select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]


# build the pipeline for the numerical attributes
imputer = preprocessing.Imputer(strategy="median")

num_pipeline = pipeline.Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", preprocessing.Imputer(strategy="median")),
])

num_pipeline.fit_transform(train_data)
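
# Side note (hedged): with newer scikit-learn, ColumnTransformer selects DataFrame columns
# directly, so a custom DataFrameSelector is not needed. A self-contained sketch of the numeric
# branch on a tiny demo frame (not the Titanic train_data):
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

_df_demo = pd.DataFrame({'Age': [22.0, np.nan, 35.0], 'Fare': [7.25, 71.28, np.nan]})
_num_ct = ColumnTransformer([('imputer', SimpleImputer(strategy='median'), ['Age', 'Fare'])])
print(_num_ct.fit_transform(_df_demo))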


#We will also need an imputer for the string categorical columns (the regular Imputer does not work on those):
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(base.BaseEstimator, base.TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
Example #23
data.dropna()    # drop rows with missing values (DataFrame method)
data.fillna(0)   # fill missing values (DataFrame method)

# plot
data.boxplot(column = 'finish',by = 'material')

### Preprocessing-------------------------------------------------------------
#Scale
sklp.minmax_scale(data,(0,1)) # data must be numerical pd or np
standardized_Dataset = sklp.scale(Dataset, axis=0)
Normalized_Dataset = sklp.normalize(Dataset, norm='l2')
binarized_Dataset = sklp.binarize(Dataset, threshold=0.0)

# Missing data

imp = sklp.Imputer(missing_values=0,strategy='mean',axis=0)
imp.fit_transform(Dataset)

# PCA
import sklearn.decomposition as skd
pca = skd.PCA(n_components=n, whiten=False)
pca.fit(Dataset)
Dataset_Reduced_Dim = pca.transform(Dataset)

# Train and Test
x_train, x_test, y_train, y_test = sklm.train_test_split(x,y,test_size = 0.2)


# Dummy encoding
from statsmodels.tools import categorical
cat_encod = categorical(data, dictnames=False, drop=False) #may need reshape(-1,1)
Example #24
def start_preprocessing():
    #loading all the different datasets
    labels = pd.read_csv('data/labels.csv', delimiter=";")
    disc = pd.read_csv('data/discrimination.csv', delimiter=",")
    domestic = pd.read_csv('data/domesticviolence.csv', delimiter=",")
    abortion = pd.read_csv('data/legalabortion.csv', delimiter=",")
    legislation = pd.read_csv('data/legislation.csv', delimiter=",")
    proper = pd.read_csv('data/property.csv', delimiter=",")
    marriage = pd.read_csv('data/marriage.csv', delimiter=",")
    sex = pd.read_csv('data/sexualharassment.csv', delimiter=",")
    gdp = pd.read_csv('data/gdpcap.csv')#, index_col=0)#.iloc[:,-6]#, delimiter=",")
    gini = pd.read_csv('data/gini.csv')
    edu = pd.read_csv('data/edu.csv', delimiter=";")
    agb = pd.read_csv('data/agb.csv', delimiter=";")

    #PREPROCESSING
    #getting rid of countries not present in all sets
    #extracting all the unique countries in the different sets to compare 
    discunique = disc.iloc[:,0].unique()
    labelsunique = labels.iloc[:,0].unique()
    uniquedomestic = domestic.iloc[:,0].unique()
    uniquelegislation = legislation.iloc[:,0].unique()
    uniqueproper = proper.iloc[:,0].unique()
    uniquemarriage = marriage.iloc[:,0].unique()
    uniquesex = sex.iloc[:,0].unique()
    uniquegdp = gdp.iloc[:,0].unique()
    uniquegini = gini.iloc[:,0].unique()
    uniqueedu = edu.iloc[:,0].unique()
    uniqueagb = agb.iloc[:,0].unique()

    #discarding examples in the discrimination data set that are not present in the label set
    discarddisc = []
    for i in discunique:
        if i not in labelsunique:
            discarddisc.append(i)
            disc.drop(disc[disc.Economy == i].index[0], inplace=True)
            
    discarddom = []
    for i in uniquedomestic:
        if i not in labelsunique:
            discarddom.append(i)
            domestic.drop(domestic[domestic.Economy == i].index[0], inplace=True)

    discardleg = []
    for i in uniquelegislation:
        if i not in labelsunique:
            discardleg.append(i)
            legislation.drop(legislation[legislation.Economy == i].index[0], inplace=True)
            
    discardprop = []
    for i in uniqueproper:
        if i not in labelsunique:
            discardprop.append(i)
            proper.drop(proper[proper.Economy == i].index[0], inplace=True)

    discardmar = []
    for i in uniquemarriage:
        if i not in labelsunique:
            discardmar.append(i)
            marriage.drop(marriage[marriage.Economy == i].index[0], inplace=True)
            
    discardsex = []
    for i in uniquesex:
        if i not in labelsunique:
            discardsex.append(i)
            sex.drop(sex[sex.Economy == i].index[0], inplace=True)

    countriessex = sex.iloc[:,0].unique()
    discardgdp = []
    for i in uniquegdp:
        if i not in countriessex:
            discardgdp.append(i)
            gdp.drop(gdp[gdp.iloc[:,0] == i].index[0], inplace=True)
            
    discardgini = []
    for i in uniquegini:
        if i not in countriessex:
            discardgini.append(i)
            gini.drop(gini[gini.iloc[:,0] == i].index[0], inplace=True)
            
    discardedu = []
    for i in uniqueedu:
        if i not in countriessex:
            discardedu.append(i)
            edu.drop(edu[edu.iloc[:,0] == i].index[0], inplace=True)
            
    discardagb = []
    for i in uniqueagb:
        if i not in countriessex:
            discardagb.append(i)
            agb.drop(agb[agb.iloc[:,0] == i].index[0], inplace=True)


    neunique = sex.iloc[:,0].unique()
    labeldel = []
    for i in labelsunique:
        if i not in neunique:
            labeldel.append(i)
            labels.drop(labels[labels.Country == i].index[0], inplace=True)

    uniqueabortion = abortion.iloc[:,0].unique()
    labelsunique = labels.iloc[:,0].unique()
    discardabor = []
    for i in uniqueabortion:
        if i not in labelsunique:
            discardabor.append(i)
            abortion.drop(abortion[abortion.COUNTRY == i].index[0], inplace=True)


    gdptrim = gdp.iloc[:,[0,-6]] # extracting the gdppercap 2013 columns, and the corresponding country
    ginitrim = gini.iloc[:,[0,-6]] # extracting the gini 2013 columns, and the corresponding country
    gdptrim.is_copy = False
    ginitrim.is_copy = False 
    #adding generic name "Economy" 
    gdptrim.rename(columns={'Country Name': 'Economy'}, inplace=True)
    ginitrim.rename(columns={'Country Name': 'Economy'}, inplace=True)
    abortion.rename(columns={'COUNTRY': 'Economy'}, inplace=True)

    #filling in nan values for gini
    means = {} #creating dictionary to hold mean gini value of 7 different regions 
    temp = list(sex.RegionName.unique()) #extracting the 7 different regions 

    for i in temp: #iterating over the the regions
        countries = list(sex[sex.iloc[:,1] == i].iloc[:,0]) #extracting all the countries that belong to this region
        col_gini = 0 #initiating int to hold cumulated gini coefficient for region
        k = 0 #counting the number of countries that contribute to the mean
        for country in countries: #iterating over the extracted countries
            gini = ginitrim[ginitrim.iloc[:,0] == country].iloc[0][1] #extracting gini for country

            if np.isnan(gini) == False: #if gini is not nan 
                k +=1 #increment k
                col_gini += gini #add to col_gini
        means[i] = col_gini/k #assign mean gini value for region to dictionary

    countries = ginitrim[np.isnan(ginitrim.iloc[:,1])].iloc[:,0] #extracting NaN Gini countries
    for i in countries:#iterating over the countries that has NaN values
        index = ginitrim[ginitrim.iloc[:,0] == i].index[0] #getting the index
        value = means[sex[sex.iloc[:,0] == i].iloc[:,1][sex[sex.iloc[:,0] == i].iloc[:,1].index[0]]] #getting value
        ginitrim.set_value(index, '2013', value) #inputting mean value to frame

    #merging all sets into one dataframe
    main = pd.DataFrame.merge(disc, domestic, how='outer')
    main = pd.DataFrame.merge(main, legislation, how='outer')
    main = pd.DataFrame.merge(main, proper, how='outer')
    main = pd.DataFrame.merge(main, marriage, how='outer')
    main = pd.DataFrame.merge(main, sex, how='outer')
    main = pd.DataFrame.merge(main, abortion, how='outer')
    main = pd.DataFrame.merge(main, gdptrim, how='outer')
    main = pd.DataFrame.merge(main, ginitrim, on='Economy')
    main = pd.DataFrame.merge(main, edu, on='Economy')
    main = pd.DataFrame.merge(main, agb, on='Economy')
    #setting index of main frame to Economy
    main = main.set_index('Economy')

    #building a dataset of only women's rights
    copy = main.iloc[:,:-4].copy() #temporary copy frame excluded the continuous values
    copy = copy.drop('RegionName', axis=1) #dropping the geographical feature RegionName from rights set
    rightsatt = list(copy.columns) #Extracting a list containing feature names, for plotting and investigation purposes 
    rights = np.zeros(copy.shape) #creating a numpy matrix to hold binary values 
    for idx, i in enumerate(copy.itertuples()): #iterating over copy frame
        for indel, item in enumerate(i): #iterating over columns 
            if indel != 0: #if it's not the first column, i.e. economy name
                if item == 'Yes': 
                    rights[idx, indel-1] = 1 #1 if yes to question
                elif item == 'No':
                    rights[idx, indel-1] = 0 #0 if no to question
                else:
                    rights[idx, indel-1] = np.nan   #nan if empty


    copy = main.copy() #new temp frame that contains all features 
    copy = copy.drop('RegionName', axis=1) #deleting RegionName feature
    x = np.zeros((119, copy.shape[1] + 7)) # creating a numpy matrix to contain all features plus regionnames
    allatt = list(copy.columns) #extracting the list of attributes 

    for i in list(main.RegionName.unique()): #appending the region names at the end of the attributes list
        allatt.append(i) 

    regions = list(main.RegionName.unique()) #extracting region names 

    for idx, i in enumerate(copy.itertuples()): #iterating over copy frame
        for indel, item in enumerate(i): #iterating over columns
            if indel == 0: #using Economy name to get region
                x[idx, 42 + regions.index(main.RegionName[item])] = 1 #dependent on the region's index in the region list
            elif indel == 39: #if the indel is GDP per cap
                if idx == 105: #if the country is 105: Syria 
                    x[idx, indel-1] = 35164000000 / 19810000 #manually inputting the GPDpercap for Syria
                else:
                    x[idx, indel-1] = item #otherwise input gdppercap for country
            elif indel == 40: #if the indel is gini 
                x[idx, indel-1] = item
            elif indel == 41: #if the indel is education
                x[idx, indel-1] = item
            elif indel == 42: #if indel is age at first birth
                x[idx, indel-1] = item
            
            else: #the feature is binary
                if item == 'Yes': 
                    x[idx, indel-1] = 1
                elif item == 'No':
                    x[idx, indel-1] = 0
                else:
                    x[idx, indel-1] = np.nan

    #changing name in attribute list
    allatt[allatt.index('2013_x')] = 'GDP per Cap' 
    allatt[allatt.index('2013_y')] = 'GINI'
    allatt[allatt.index('2013')] = 'Education'

    #deleting excess columns in labels data 
    del labels['ISO3']
    del labels['2013 Rank']
    labels = labels.set_index('Country') #setting the country as index
    labels = labels.sort_index() #sorting to correspond sequence in x and rights 


    #fitting an imputer, to replace NaNs with mode: a total of 30 countries have 15 or 16 nans 
    rightsprep = preprocessing.Imputer(axis=0, strategy="most_frequent").fit(rights)
    rights = rightsprep.transform(rights)

    #fitting an imputer to replace NaNs with the mode in the complete set 
    xprep = preprocessing.Imputer(axis=0, strategy="most_frequent").fit(x)
    x = xprep.transform(x)
    return rights, rightsatt, x, allatt, labels, main
titanic_test = pd.read_csv("test.csv")

#Note that you have to do the same work on test as well
#EDA
titanic_test.shape
titanic_test.info()

titanic_test1 = pd.get_dummies(titanic_test,
                               columns=['Pclass', 'Sex', 'Embarked'])
titanic_test1.shape
titanic_test1.info()
titanic_test1.head(6)
titanic_test1.describe()

X_test = titanic_test1.drop(['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name'],
                            1)
X_test.info()

#create an instance of Imputer class with required arguments
mean_imputer = preprocessing.Imputer()  #Default value for Imputer is mean
#compute mean of age and fare respectively
mean_imputer.fit(X_test[['Fare']])
#fill up the missing data with the computed means
X_test[['Fare']] = mean_imputer.transform(X_test[['Fare']])

titanic_test['Survived'] = lr_grid_estimator.predict(X_test)

titanic_test.to_csv('submission.csv',
                    columns=['PassengerId', 'Survived'],
                    index=False)
titanic_test.shape
titanic_test.info()

#merge train and test data
titanic_all = pd.concat([titanic_train, titanic_test], axis=0)
titanic_all.shape
titanic_all.info()

#explore missing data
titanic_all.apply(lambda x: sum(x.isnull()))

#pre-process Embarked
titanic_all.loc[titanic_all['Embarked'].isnull(), 'Embarked'] = 'S'

#pre-process Age
age_imputer = preprocessing.Imputer()
titanic_all[['Age']] = age_imputer.fit_transform(titanic_all[['Age']])


#create family size feature
def size_to_type(x):
    if (x == 1):
        return 'Single'
    elif (x >= 2 and x <= 4):
        return 'Small'
    else:
        return 'Large'


titanic_all['FamilySize'] = titanic_all.SibSp + titanic_all.Parch + 1
titanic_all['FamilyType'] = titanic_all['FamilySize'].map(size_to_type)
    # 1.1.2 Interval scaling: scale feature values into the [0, 1] range (applied to each column)
    features_new = preprocessing.MinMaxScaler().fit_transform(features)
    # 1.1.3 Normalization: turn each row vector into a unit vector (applied to each sample)
    features_new = preprocessing.Normalizer().fit_transform(features)

    # 1.2 Binarize quantitative features: pick a threshold; values above it become 1, values at or below it become 0
    features_new = preprocessing.Binarizer(threshold=3).fit_transform(features)

    # 1.3 Encode qualitative (categorical) features (pandas.get_dummies can also be used)
    enc = preprocessing.OneHotEncoder()
    enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    # print(enc.transform([[0, 1, 3]]))
    # print(enc.transform([[0, 1, 3]]).toarray())

    # 1.4 Impute missing values (pandas.fillna can also be used)
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    features_new = imp.fit_transform(
        vstack((array([nan, nan, nan, nan]), features)))

    # 1.5 Data transformations
    # 1.5.1 Polynomial transformation (applied to each row)
    features_new = preprocessing.PolynomialFeatures().fit_transform(features)
    # 1.5.2 Transformation with a custom function, using log as an example
    features_new = preprocessing.FunctionTransformer(
        np.log1p).fit_transform(features)
    '''
    2. Feature selection
    '''
    # 2.1 Filter
    # 2.1.1 Variance threshold: select features whose variance is greater than a threshold
    features_new = feature_selection.VarianceThreshold(
Example #28
def impute_continuous_features(df, features):
    cont_imputer = preprocessing.Imputer()
    cont_imputer.fit(df[features])
    print(cont_imputer.statistics_)
    df[features] = cont_imputer.transform(df[features])
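
# A hypothetical usage sketch (hedged): applying the helper above to a small illustrative frame.
# The column names are placeholders, and the call relies on the legacy preprocessing.Imputer
# import used throughout these examples.
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'Age': [22.0, np.nan, 35.0], 'Fare': [7.25, 71.28, np.nan]})
impute_continuous_features(df_demo, ['Age', 'Fare'])
print(df_demo)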
Example #29
algorithms={
    'Regression': ['All','ExtraTreeRegressor','GradientBoostingRegressor','DecisionTreeRegressor','LinearSVR',\
        'RandomForestRegressor','XGBRegressor','KNeighborsRegressor','LinearRegression'],
    'Classification': ['All','DecisionTreeClassifier','ExtraTreesClassifier','RandomForestClassifier','GradientBoostingClassifier',\
        'KNeighborsClassifier','LinearSVC','LogisticRegression','XGBClassifier']
}

optionsForDropdown={
      "changedataTypes": ['None',"Continuous", "Categorical"],
      "imputation_methods": ['None',"Mean", "Median", "Mode", "Back fill", "Forward fill"],
      "data_transformation_steps": ["None", "One Hot Encoding", "Label Encoding", "Normalize", "Scaling Standard", "Scaling Min Max", "Scaling Max Absolute"],
      "algorithmTypes":algorithms
    }

processe_short={'Mean':preprocessing.Imputer(strategy="mean"),
    'Median':preprocessing.Imputer(strategy="median"),
    'Mode':preprocessing.Imputer(strategy="most_frequent"),  # Imputer has no "mode" strategy; most_frequent computes the mode
    'Scaling Min Max':preprocessing.MinMaxScaler(),
    'Scaling Standard':preprocessing.StandardScaler(),
    'Label Encoding':preprocessing.LabelEncoder(),
    'One Hot Encoding':preprocessing.LabelBinarizer(),
    'Normalize':preprocessing.StandardScaler(),
    'Scaling Max Absolute':preprocessing.MinMaxScaler(),}

class AutoMLUtilities:


    def dataDescription(self,data):
        dataDtype=dict(data.dtypes)
        dataMissingVal=dict(data.isnull().sum())
Example #30
 def __init__(self, n, d):
     self.transformer_ = preprocessing.Imputer(strategy='most_frequent')
     self.param_grid_ = {}