if data is not None:
    filename = '{}_{}.csv'.format(config["experiment_name"], name)
    try:
        # Try to convert to dataframe; it will fail if data is empty
        df = data.to_df()
        df.to_csv(os.path.join(path_to_dumps, filename))
    except Exception as e:
        logger.info('Error saving {} as csv: {}'.format(filename, e))
else:
    logger.info('{} is None, skipping dump...'.format(name))

# Impute missing values (mean is the only strategy for now).
# Note that the features can specify imputation strategies;
# if they don't, they already got a default imputer, which
# imputes the median (for floats) or 0 (for integers).
logger.info('Imputing values on train and test...')
imputer = preprocessing.Imputer().fit(train.x)
train.x = imputer.transform(train.x)
test.x = imputer.transform(test.x)
logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape, test.x.shape))

if args.predicttop:
    preds.x = imputer.transform(preds.x)
    logger.debug('Prediction x shape: {}'.format(preds.x.shape))

# Scale features to zero mean and unit variance
logger.info('Scaling train, test...')
scaler = preprocessing.StandardScaler().fit(train.x)
train.x = scaler.transform(train.x)
test.x = scaler.transform(test.x)
logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape, test.x.shape))
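# Editor's note: preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22.
# A minimal, self-contained sketch of the same impute-then-scale flow with the current API
# (SimpleImputer); the toy arrays below are illustrative only, not part of the original script.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

train_x = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
test_x = np.array([[np.nan, 4.0], [5.0, np.nan]])

new_imputer = SimpleImputer(strategy='mean').fit(train_x)   # fit on train only
train_x = new_imputer.transform(train_x)
test_x = new_imputer.transform(test_x)                      # reuse the train statistics

new_scaler = StandardScaler().fit(train_x)
train_x = new_scaler.transform(train_x)
test_x = new_scaler.transform(test_x)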
indices = np.argsort(classifier.feature_importances_)[::-1][:40]
g = sns.barplot(y=X_train.columns[indices][:40],
                x=classifier.feature_importances_[indices][:40],
                orient='h')
g.set_xlabel("Relative importance", fontsize=12)
g.set_ylabel("Features", fontsize=12)
g.tick_params(labelsize=9)
g.set_title("DT feature importances")

titanic_train = pd.read_csv("C:/Users/Algorithmica/Downloads/all/train.csv")
print(titanic_train.shape)
print(titanic_train.info())

imputable_cont_features = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(
    titanic_train[imputable_cont_features])

# impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

# create categorical age column from age
def convert_age(age):
    if (age >= 0 and age <= 18):
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')
titanic_train.info()

titanic_test['Survived'] = None
titanic_test.info()
titanic_test.shape[0]

titanicAll = pd.concat([titanic_train, titanic_test])
titanicAll.info()

# EDA
titanicAll.shape
titanicAll.info()

# create an instance of the Imputer class with the required arguments
mean_imputer = preprocessing.Imputer()
# compute the mean of Age and Fare respectively
mean_imputer.fit(titanic_train[['Age', 'Fare']])
# fill up the missing data with the computed means
titanicAll[['Age', 'Fare']] = mean_imputer.transform(titanicAll[['Age', 'Fare']])

# Features considered till now: Age, Fare, Survived
# Feature creation: create a new feature from the Age column for visualization,
# to find differences between age groups.
def ageRange(age):
    ageRange = ''
    if age < 16:
        ageRange = 'Child'
    elif age <= 30:
        ageRange = 'Young'
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.preprocessing as prepross
import sklearn.model_selection._split as split

# raw string so the backslashes in the Windows path are not treated as escapes
dataset = pd.read_csv(
    r'C:\DOC\Workspace\Machine Learning A-Z Template Folder\Part 1 - Data Preprocessing\Data.csv'
)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Handle missing data
imputer = prepross.Imputer()
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# Encoding categorical data
label_encoder = prepross.LabelEncoder()
x[:, 0] = label_encoder.fit_transform(x[:, 0])
y = label_encoder.fit_transform(y)

# One hot
one_hot_encoder_x = prepross.OneHotEncoder(categorical_features=[0])
x = one_hot_encoder_x.fit_transform(x).toarray()

# Remove first column (dummy variable trap)
x = np.delete(x, 0, axis=1)

# Split test - train
# In[7]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                     stratify=y,
                                                     random_state=random_seed)

# ### normalize train data
#
# fill the NAs with the median, then standardize the data; the output type is ndarray

# In[8]:

clean_pipeline = Pipeline([
    ('imputer', preprocessing.Imputer(missing_values='NaN', strategy="median")),
    ('std_scaler', preprocessing.StandardScaler()),
])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.transform(X_test)  # transform only: reuse the training medians and scales

# ## TEST CE

# In[9]:

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      test_size=0.25,
                                                      stratify=y_train,
                                                      random_state=random_seed)
from sklearn import preprocessing

os.chdir(r'D:\Projects\datasets')

# read and explore data
titanic_train = pd.read_csv('titanic_train.csv')
titanic_train.shape
titanic_train.info()

# create title column from name
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

titanic_train['Title'] = titanic_train['Name'].map(extract_title)
sns.factorplot(x="Title", hue="Survived", data=titanic_train, kind="count", size=6)

age_imputer = preprocessing.Imputer()
age_imputer.fit(titanic_train[['Age']])
titanic_train[['Age']] = age_imputer.transform(titanic_train[['Age']])

# create categorical age column from age
def convert_age(age):
    if (age >= 0 and age <= 10):
        return 'Child'
    elif (age <= 25):
        return 'Young'
    elif (age <= 50):
        return 'Middle'
    else:
        return 'Old'

titanic_train['Age1'] = titanic_train['Age'].map(convert_age)
sns.factorplot(x="Age1", hue="Survived", data=titanic_train, kind="count", size=6)
                     header=None)  # header none because no column names
dframe.info()
numdframe = dframe.iloc[:, 1:]
catdframe = dframe.iloc[:, 0]
catdf_encod = categorical(catdframe.values, dictnames=False, drop=True)
numArr = np.asarray(numdframe.values)
catArr = np.asarray(catdf_encod)
Output = numArr[:, 5]
Inp_num = numArr[:, 0:5]
Input = np.concatenate((Inp_num, catArr), axis=1)
Input = np.c_[catArr, Inp_num]
print(Input.shape)

#### Q1 (b) ########
imp = skp.Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
Input_new = imp.fit_transform(Input)

#### Q1 (c) ######
X_train, X_test, y_train, y_test = skms.train_test_split(Input_new, Output,
                                                         test_size=0.25,
                                                         random_state=111)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(Input)

##### Q1 (d) ######
svc_rbf = SVC(kernel='rbf', gamma='auto',
# values in A1:
#   A1: b, a
#
# Use the impute-by-mean approach to fill the missing
# values in A2:
#   A2: continuous

crx_data = pd.read_csv("crx.data", header=None)

# Since the Japanese Credit Data Set uses "?" to denote missing values,
# replace it with np.nan. scikit-learn's Imputer only accepts np.nan
# or an integer, therefore convert "?" to np.nan.
# This transformation is for A2, which uses scikit-learn's Imputer.
# For A1, which uses imputer_by_most_frequent(), this transformation
# is not necessary.
crx_data.replace("?", np.nan, inplace=True)

A1_no_missing = imputer_by_most_frequent(np.nan, crx_data.iloc[:, 0].values)
print(A1_no_missing)

# Use scikit-learn's Imputer to fill missing values with the mean.
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
imputer = preprocessing.Imputer(missing_values=np.nan, strategy="mean", axis=0)

# Convert to a two-dimensional list, since Imputer only accepts
# two-dimensional input.
A2_two_d = np.array([[item] for item in crx_data.iloc[:, 1].values])
A2_no_missing = imputer.fit_transform(A2_two_d)
print(A2_no_missing)
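# Editor's note: imputer_by_most_frequent() is referenced above but not defined in this excerpt.
# A plausible, hypothetical implementation (an assumption, not the author's original code):
# replace every occurrence of `missing_value` in a 1-D sequence with the most frequent
# remaining value.
from collections import Counter

def imputer_by_most_frequent(missing_value, data):
    def is_missing(v):
        if isinstance(missing_value, float) and np.isnan(missing_value):
            return isinstance(v, float) and np.isnan(v)
        return v == missing_value

    observed = [v for v in data if not is_missing(v)]
    mode = Counter(observed).most_common(1)[0][0]
    return np.array([mode if is_missing(v) else v for v in data])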
chr1.read()
read_time = time.time() - start_time
hour, minute, second = pr.time_process(read_time)
print('\n')
print('Loading time: ' + str(hour) + "h " + str(minute) + "m " + str(second) + "s ")

start_time = time.time()
chr1.data_extract(strand_binary=True, pos_normalize=True)

from sklearn import preprocessing
imputer = preprocessing.Imputer(copy=False)
imputer.fit_transform(chr1.train_beta)

process_time = time.time() - start_time
hour, minute, second = pr.time_process(process_time)
print('\n')
print('Processing time: ' + str(hour) + "h " + str(minute) + "m " + str(second) + "s ")

train_beta_mean = np.mean(chr1.train_beta, axis=1)
predict = train_beta_mean[chr1.sample_nan]

start_time = time.time()

# Normalized square error for prediction
test_not_nan = []
def main():
    csv_file_object = csv.reader(open('Data/train.csv'))  # Load in the training csv file
    header = next(csv_file_object)  # Skip the first line as it is a header
    train_data = []  # Create a variable called 'train_data'
    for row in csv_file_object:  # Step through each row in the csv file
        train_data.append(row[1:])  # adding each row to the data variable
    train_data = np.array(train_data)  # Then convert from a list to an array

    # I need to convert all strings to integer classifiers:
    # Male = 1, female = 0:
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    # embark c=0, s=1, q=2
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    # Survived
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    # I need to fill in the gaps of the data and make it complete.
    # Where there is no price, I will assume the median price of that class.
    # Where there is no age, I will give the median of all ages.
    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    # All the ages with no data: make them the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #    != '',4].astype(np.float))
    # All missing embarks: just make them embark from the most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #    != '',10].astype(np.float)))

    train_data = np.delete(train_data, [2, 7, 9, 10], 1)  # remove the name, cabin and ticket data
    train_data[train_data == ''] = '0'
    imp.fit_transform(train_data)

    # I need to do the same with the test data now so that the columns are the same
    # as in the training data.

    # We finally split the data between train set and validation set
    x_train, x_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.2,
                                                        random_state=0)

    # Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    test_file_object = csv.reader(open('Data/test.csv'))  # Load in the test csv file
    header = next(test_file_object)  # Skip the first line as it is a header
    test_data = []  # Create a variable called 'test_data'
    ids = []
    for row in test_file_object:  # Step through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:])  # adding each row to the data variable
    test_data = np.array(test_data)  # Then convert from a list to an array

    # I need to convert all strings to integer classifiers:
    # Male = 1, female = 0:
    test_data[test_data[0::, 2] == 'male', 2] = 1
    test_data[test_data[0::, 2] == 'female', 2] = -1
    # embark c=0, s=1, q=2
    # Note this is not ideal; in more complex models 3 is not 3 times better than 1,
    # and 2 is not 2 times better than 1.
    test_data[test_data[0::, 9] == 'C', 9] = -1
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    # All the ages with no data: make them the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #    != '',3].astype(np.float))
    # All missing embarks: just make them embark from the most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #    != '',9].astype(np.float)))
    # All the missing prices: assume the median of their respective class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #                                             ,7].astype(np.float))

    test_data = np.delete(test_data, [1, 6, 8, 9], 1)  # remove the name, cabin and ticket data
    test_data[test_data == ''] = '0'
    # Impute missing values
    imp.fit_transform(test_data)

    # Standardize
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std = scaler_test.transform(test_data)

    # The data is now ready to go. So let's train, then test!
    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)

    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)

    print('Predicting')
    output = random_search.predict(test_data_std)

    open_file_object = csv.writer(open("pipelinearsvcknn.csv", "w", newline=''))
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))
# In[8]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                     stratify=y,
                                                     random_state=random_seed)
print("80%% train: %d/%d, 20%% test: %d/%d" % (X_train.shape[0], X.shape[0],
                                               X_test.shape[0], X.shape[0]))

# ### normalize train data
#
# fill the NAs with the median, then standardize the data; the output type is ndarray

# In[9]:

clean_pipeline = Pipeline([
    ('imputer', preprocessing.Imputer(missing_values='NaN', strategy="median")),
    ('std_scaler', preprocessing.StandardScaler()),
])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.transform(X_test)  # transform only: reuse the training medians and scales

# # model selection
# CE without cross validation

# In[10]:

X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train,
                                                        test_size=0.25,
                                                        stratify=y_train,
                                                        random_state=random_seed)
# Concatenation is done so that train and test have the same columns, which makes the work easier
titanic = pd.concat([titanic_train, titanic_test])
titanic.shape
titanic.info()

# Extract and create title column from name
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

# The map(aFunction, aSequence) function applies a passed-in function to each item in an iterable object
# and returns a list containing all the function call results.
titanic['Title'] = titanic['Name'].map(extract_title)

mean_imputer = preprocessing.Imputer()  # the default strategy is mean, so let it use the default
# Age is missing in both train and test data.
# Fare is NOT missing in the train data but is missing in the test data. Since we are working on the
# combined Titanic data, we apply the mean imputer on Fare as well.
mean_imputer.fit(titanic_train[['Age', 'Fare']])
titanic[['Age', 'Fare']] = mean_imputer.transform(titanic[['Age', 'Fare']])

# create categorical age column from age
# It's always a good practice to create functions so that the same can be applied on test data as well
def convert_age(age):
    if (age >= 0 and age <= 10):
        return 'Child'
    elif (age <= 25):
        return 'Young'
    elif (age <= 50):
        return 'Middle'
test_data_no = test_data_S[:, 0]
# test_data_S = test_data_S[:, 1:test_data_S.shape[1]]
test_data_S = append_feature(test_df, istest=True)

print('data split end.', trans_S.shape, trans_T.shape, label_S.shape,
      label_T.shape, test_data_S.shape)

# # Adding features for the sum, variance, and missing-value count improved results somewhat
# trans_T = append_feature(trans_T, train_df)
# trans_S = append_feature(trans_S, train_df1)
# test_data_S = append_feature(test_data_S, test_df)
#
# print 'append feature end.', trans_S.shape, trans_T.shape, label_S.shape, label_T.shape, test_data_S.shape

imputer_T = preprocessing.Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer_S = preprocessing.Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

# imputer_T.fit(trans_T, label_T)
imputer_S.fit(trans_S, label_S)

trans_T = imputer_S.transform(trans_T)
trans_S = imputer_S.transform(trans_S)
test_data_S = imputer_S.transform(test_data_S)

# pca_T = decomposition.PCA(n_components=50)
# pca_S = decomposition.PCA(n_components=50)
#
from sklearn import preprocessing
import numpy as np

# Handling missing values.
# Collected data can contain many missing values;
# let's look at how to handle them.
# Use preprocessing's Imputer to replace missing values with a chosen statistic.
x = [[1, 2], [np.nan, 3], [7, 6], [7, 2], [2, 3], [3, 4]]

help(preprocessing.Imputer)

# strategy ==> mean, median, most_frequent
imp = preprocessing.Imputer(missing_values="NaN", strategy="median")
x2 = imp.fit(x).transform(x)
print(x2)
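# Editor's note: on scikit-learn 0.22+ (where preprocessing.Imputer no longer exists) the same
# demo would use SimpleImputer from sklearn.impute. A minimal sketch, reusing `x` from above:
from sklearn.impute import SimpleImputer

imp_new = SimpleImputer(missing_values=np.nan, strategy="median")
print(imp_new.fit_transform(x))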
# Data Preprocessing

# Importing libs
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.preprocessing as sklp
import sklearn.model_selection as sklcv

# Read in data
dataset = pd.read_csv("Data.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Fix missing data issues
imputer = sklp.Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Deal with categorical data
lbEncIv = sklp.LabelEncoder()
X[:, 0] = lbEncIv.fit_transform(X[:, 0])
oneHotEncIv = sklp.OneHotEncoder(categorical_features=[0])
X = oneHotEncIv.fit_transform(X).toarray()
lbEncDv = sklp.LabelEncoder()
y = lbEncDv.fit_transform(y)  # encode the dependent variable with its own encoder

# Split dataset into training and test datasets to validate the ML model
XTrain, XTest, yTrain, yTest = sklcv.train_test_split(X, y, test_size=0.2, random_state=0)
            'clarity': [2, ('I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2')],
            'depth': [0, (40, 80)],
            'table': [0, (40, 100)],
            'x': [0, (0, 11)],
            'y': [0, (0, 60)],
            'z': [0, (0, 32)],
            'price': [0, (300, 20000)]
            }
rie = ReplaceImputeEncode(data_map=data_map, display=True)
df.rie = rie.fit_transform(df)

# Imputing Missing Values
from sklearn import preprocessing
interval_attributes = ['Carat', 'depth', 'table', 'x', 'y', 'z']
interval_data = df.as_matrix(columns=interval_attributes)
interval_imputer = preprocessing.Imputer(strategy='mean')
imputed_interval_data = interval_imputer.fit_transform(interval_data)
print("Imputed Interval Data:\n", imputed_interval_data)

# Convert string categorical attributes to numbers for further assessment
# Mapping of categories to numbers for attribute 'cut'
cut_map = {'Ideal': 0, 'Premium': 1, 'Good': 2, 'Very Good': 3, 'Fair': 4}
df['cut'] = df['cut'].map(cut_map)
# Mapping of categories to numbers for attribute 'color'
color_map = {'E': 0, 'I': 1, 'J': 2, 'H': 3, 'F': 4, 'G': 5, 'D': 6}
df['color'] = df['color'].map(color_map)
# Mapping of categories to numbers for attribute 'clarity'
clarity_map = {
    'SI2': 0,
    'SI': 1,
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) ############################################################################ # We can now also inspect the flow object which was automatically created: flow = openml.flows.get_flow(run.flow_id) pprint(vars(flow), depth=1) ############################################################################ # It also works with pipelines # ############################ # # When you need to handle 'dirty' data, build pipelines to model then automatically. task = openml.tasks.get_task(115) pipe = pipeline.Pipeline( steps=[('Imputer', preprocessing.Imputer(strategy='median')), ('OneHotEncoder', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore') ), ('Classifier', ensemble.RandomForestClassifier())]) flow = openml.flows.sklearn_to_flow(pipe) run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False) myrun = run.publish() print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) ############################################################################ # Challenge # ^^^^^^^^^ # # Try to build the best possible models on several OpenML tasks, # compare your results with the rest of the class and learn from
def fill_null_columns(request):
    if request.method == 'POST':
        post_data = json.loads(request.POST['data'])
        columns_to_fill = post_data['cols_to_fill']
        strategy = post_data["strategy"]
        try:
            enc = preprocessing.LabelEncoder()
            with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tp:
                csv = request.user.csvfile.read().strip().split("\n")
                header = csv[0].split(",")
                tp.write(','.join(header) + '\n')
                cols_indices = []
                csv.pop(0)
                search_terms = []
                for search_term, cols in columns_to_fill.items():
                    for col in cols:
                        cols_indices.append(header.index(col))
                    if search_term == "empty":
                        search_term = ""
                    search_terms.append(search_term)
                lines_list = []
                null_list = []
                for line in csv:
                    line_list = line.split(",")
                    lines_list.append(line_list)
                    null_list_temp = []
                    for ci in cols_indices:
                        if line_list[ci] in search_terms:
                            null_list_temp.append(np.nan)
                        else:
                            null_list_temp.append(line_list[ci])
                    null_list.append(null_list_temp)
                enc = preprocessing.Imputer(missing_values='NaN', strategy=strategy)
                imputed_list = list(enc.fit_transform(null_list))
                for i, value in enumerate(imputed_list):
                    for j, c_value in enumerate(value):
                        lines_list[i][cols_indices[j]] = str(c_value)
                for i, line in enumerate(lines_list):
                    tp.write(','.join(line) + '\n')
                tp.flush()
                tp.seek(0)
                request.user.csvfile.delete()
                request.user.csvfile.save('csvfile.csv', ContentFile(tp.read()))
        except Exception as e:
            print(e)
            return JsonResponse({"status": "failed", "message": str(e)})
        return JsonResponse({"status": 'success'})
@author: arellave
"""

# use of the inbuilt pipeline
from sklearn import datasets
import numpy as np

# generate a random symmetric positive semi-definite matrix
mat = datasets.make_spd_matrix(10)
masking_array = np.random.binomial(1, 0.1, mat.shape).astype(bool)
mat[masking_array] = np.nan
print(mat[:4, :4])

# without pipeline
from sklearn import preprocessing
impute = preprocessing.Imputer()
scaler = preprocessing.StandardScaler()
mat_imputed = impute.fit_transform(mat)
print("Imputing :")
print(mat_imputed[:4, :4])
mat_imp_and_scaled = scaler.fit_transform(mat_imputed)
print("Scaling :")
print(mat_imp_and_scaled[:4, :4])

# using pipeline
from sklearn import pipeline
pipe = pipeline.Pipeline([('impute', impute), ('scaler', scaler)])
print(pipe)
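# Editor's note: the excerpt stops after printing the pipeline. A short usage sketch of the
# pipeline built above (an illustration of the next step, not necessarily the author's code):
new_mat = pipe.fit_transform(mat)   # impute, then scale, in a single call
print(new_mat[:4, :4])
# The pipelined result matches the two-step result computed earlier:
# np.array_equal(new_mat, mat_imp_and_scaled) -> True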
def main():
    # Runtime
    start_time = time.time()

    # read in dataset
    gtd = pd.read_csv("GTD/globalterrorismdb (cleaned).csv", delimiter=",")

    # remove features
    gtd = gtd.drop(['country_txt'], axis=1)
    gtd = gtd.drop(['region_txt'], axis=1)
    gtd = gtd.drop(['attacktype_txt'], axis=1)
    gtd = gtd.drop(['targtype_txt'], axis=1)
    gtd = gtd.drop(['targsubtype_txt'], axis=1)
    gtd = gtd.drop(['weaptype_txt'], axis=1)
    gtd = gtd.drop(['area'], axis=1)
    gtd = gtd.drop(['city'], axis=1)
    gtd = gtd.drop(['property'], axis=1)
    gtd = gtd.drop(['propextent'], axis=1)
    gtd = gtd.drop(['propextent_txt'], axis=1)

    # REMOVED: USED TO DROP TO 2 FEATURES
    # ---------------------------------------------------------------
    # Dropped after feature selection
    # gtd = gtd.drop(['nwound'], axis=1)
    # gtd = gtd.drop(['ishostkid'], axis=1)
    # gtd = gtd.drop(['attacktype'], axis=1)
    # gtd = gtd.drop(['nkill'], axis=1)
    # gtd = gtd.drop(['targtype'], axis=1)
    # gtd = gtd.drop(['targsubtype'], axis=1)
    # gtd = gtd.drop(['weaptype'], axis=1)
    # gtd = gtd.drop(['year'], axis=1)
    # gtd = gtd.drop(['success'], axis=1)
    # ---------------------------------------------------------------

    # REMOVED: USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # gtd = gtd.drop(['country'], axis=1)
    # gtd = gtd.drop(['region'], axis=1)
    # gtd = gtd.drop(['attacktype'], axis=1)
    # gtd = gtd.drop(['targtype'], axis=1)
    # gtd = gtd.drop(['targsubtype'], axis=1)
    # gtd = gtd.drop(['weaptype'], axis=1)
    # ---------------------------------------------------------------

    # top organisations
    print("\n\nTop organisations")
    print(pd.value_counts(gtd['gname']))

    # new dataframe with only selected organisations
    test1 = gtd[gtd.gname == 'Taliban']
    test2 = gtd[gtd.gname == 'Shining Path (SL)']
    test3 = gtd[gtd.gname == 'Farabundo Marti National Liberation Front (FMLN)']
    test4 = gtd[gtd.gname == 'Islamic State of Iraq and the Levant (ISIL)']
    test5 = gtd[gtd.gname == 'Irish Republican Army (IRA)']
    test6 = gtd[gtd.gname == 'Revolutionary Armed Forces of Colombia (FARC)']
    test7 = gtd[gtd.gname == 'New People\'s Army (NPA)']
    test8 = gtd[gtd.gname == 'Al-Shabaab']
    test9 = gtd[gtd.gname == 'Basque Fatherland and Freedom (ETA)']
    test10 = gtd[gtd.gname == 'Boko Haram']
    test11 = gtd[gtd.gname == 'Kurdistan Workers\' Party (PKK)']
    test12 = gtd[gtd.gname == 'Communist Party of India - Maoist (CPI-Maoist)']
    test13 = gtd[gtd.gname == 'Liberation Tigers of Tamil Eelam (LTTE)']
    test14 = gtd[gtd.gname == 'National Liberation Army of Colombia (ELN)']
    test15 = gtd[gtd.gname == 'Tehrik-i-Taliban Pakistan (TTP)']
    test16 = gtd[gtd.gname == 'Maoists']
    test17 = gtd[gtd.gname == 'Palestinians']
    test18 = gtd[gtd.gname == 'Nicaraguan Democratic Force (FDN)']
    test19 = gtd[gtd.gname == 'Al-Qaida in the Arabian Peninsula (AQAP)']
    test20 = gtd[gtd.gname == 'Manuel Rodriguez Patriotic Front (FPMR)']

    frames = [
        test1, test2, test3, test4, test5, test6, test7, test8, test9, test10,
        test11, test12, test13, test14, test15, test16, test17, test18,
        test19, test20
    ]
    result = pd.concat(frames)

    # determine number of missing values in each column
    print("\n\nCheck missing values")
    print(result.isnull().sum())

    # REMOVE TO RUN WITH 2 FEATURES
    # ---------------------------------------------------------------
    # impute the mean value for all missing values in the nkill, nwound,
    # targsubtype and ishostkid columns
    imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=1)

    imputer.fit(result["nkill"])
    newValues1 = imputer.transform(result["nkill"])
    result["nkill"] = newValues1[0]

    imputer.fit(result["targsubtype"])
    newValues2 = imputer.transform(result["targsubtype"])
    result["targsubtype"] = newValues2[0]

    imputer.fit(result["nwound"])
    newValues3 = imputer.transform(result["nwound"])
    result["nwound"] = newValues3[0]

    imputer.fit(result["ishostkid"])
    newValues4 = imputer.transform(result["ishostkid"])
    result["ishostkid"] = newValues4[0]
    # ---------------------------------------------------------------

    print("\n\nCheck if missing values were removed")
    print(result.isnull().sum())

    # encode categorical variables as continuous variables
    result['organisation'] = result['gname'].map({
        'Taliban': 0,
        'Shining Path (SL)': 1,
        'Farabundo Marti National Liberation Front (FMLN)': 2,
        'Islamic State of Iraq and the Levant (ISIL)': 3,
        'Irish Republican Army (IRA)': 4,
        'Revolutionary Armed Forces of Colombia (FARC)': 5,
        'New People\'s Army (NPA)': 6,
        'Al-Shabaab': 7,
        'Basque Fatherland and Freedom (ETA)': 8,
        'Boko Haram': 9,
        'Kurdistan Workers\' Party (PKK)': 10,
        'Communist Party of India - Maoist (CPI-Maoist)': 11,
        'Liberation Tigers of Tamil Eelam (LTTE)': 12,
        'National Liberation Army of Colombia (ELN)': 13,
        'Tehrik-i-Taliban Pakistan (TTP)': 14,
        'Maoists': 15,
        'Palestinians': 16,
        'Nicaraguan Democratic Force (FDN)': 17,
        'Al-Qaida in the Arabian Peninsula (AQAP)': 18,
        'Manuel Rodriguez Patriotic Front (FPMR)': 19
    }).astype(int)
    result = result.drop(['gname'], axis=1)

    print("\n\nData frame information")
    print(result.info())

    # REMOVED: USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # perform one-hot encoding on the categorical columns
    # result = pd.get_dummies(result, columns=["country_txt"])
    # result = pd.get_dummies(result, columns=["region_txt"])
    # result = pd.get_dummies(result, columns=["area"])
    # result = pd.get_dummies(result, columns=["city"])
    # result = pd.get_dummies(result, columns=["attacktype_txt"])
    # result = pd.get_dummies(result, columns=["targtype_txt"])
    # result = pd.get_dummies(result, columns=["targsubtype_txt"])
    # result = pd.get_dummies(result, columns=["weaptype_txt"])
    # --------------------------------------------------------------

    # Next separate the class data from the training data
    target = result["organisation"]
    data = result.drop(["organisation"], axis=1)

    # REMOVED: FEATURE SELECTION
    # ---------------------------------------------------------------
    # Univariate Feature Selection
    # feature_names = list(result.columns.values)
    # Selector_f = SelectPercentile(f_regression, percentile=25)
    # Selector_f.fit(data, target)
    # for n, s in zip(feature_names, Selector_f.scores_):
    #     print 'F Score', s, "for feature", n
    # Tree-based Feature Selection
    # forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # forest.fit(data, target)
    # importances = forest.feature_importances_
    # for n, s in zip(feature_names, importances):
    #     print 'F Score', s, "for feature", n
    # ---------------------------------------------------------------

    print("\n\nnumber of features")
    print(len(result.columns))
    print("number of rows")
    print(result.shape[0])

    # REMOVED: USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # print "\n\nRunning classifiers before standardization"
    # runClassifiers(data, target)
    # --------------------------------------------------------------

    # Run standardization on the data
    scalingObj = preprocessing.StandardScaler()
    standardizedData = scalingObj.fit_transform(data)
    data = pd.DataFrame(standardizedData, columns=data.columns)

    # Split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=0)

    # REMOVED: USED AFTER RESULT ON TOP 10 (RESULT kernel = linear, C = 1)
    # Forcing best_params_ to the above as I need the estimator object
    # ---------------------------------------------------------------
    # Hyper-parameter optimization on the data.
    print("\n\nRunning hyper-parameter optimization........")
    # param_grid = [{'kernel': ['rbf', 'poly', 'linear'], 'C': range(1, 15)}]
    param_grid = [{'kernel': ['linear'], 'C': range(1, 2)}]
    clf = GridSearchCV(SVC(), param_grid, cv=10)
    clf.fit(data, target)
    print("\n\nBest parameters set found on development set:")
    print(clf.best_params_)
    # ---------------------------------------------------------------

    # Run classifier
    classifier = svm.SVC(kernel=clf.best_params_["kernel"], C=clf.best_params_["C"])
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Plot non-normalized confusion matrix
    plt.figure()
    class_names = [
        'Taliban', '(SL)', '(FMLN)', '(ISIL)', '(IRA)', '(FARC)', '(NPA)',
        'Al-Shabaab', '(ETA)', 'Boko Haram', '(PKK)', '(CPI-Maoist)',
        '(LTTE)', '(ELN)', '(TTP)', 'Maoists', 'Palestinians', '(FDN)',
        '(AQAP)', '(FPMR)'
    ]
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')
    plt.show()

    # REMOVED: USED IN INITIAL RESULTS ONLY
    # ---------------------------------------------------------------
    # print "\n\nRunning classifiers after standardization"
    # run_classifiers(data, target)
    # ---------------------------------------------------------------

    scores = model_selection.cross_val_score(clf.best_estimator_, data, target, cv=10)
    print("SVM : ", scores.mean())

    # Runtime
    print("--- %s seconds ---" % (time.time() - start_time))
def __init__(self, number_of_features, random_seed):
    self.__number_of_features = number_of_features

    # Documentation of function:
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html#sklearn.preprocessing.Imputer
    self.__imputer = preprocessing.Imputer(
        missing_values="NaN",
        strategy="median",
        verbose=1,
        axis=0,
        # Probably not necessary, but future-proof. Setting to false may improve
        # performance and reduce memory requirements.
        copy=True)

    # We want to manually control the number of features at each split in order to tune the
    # algorithm for Spectrum data.
    features_at_each_split = int(math.sqrt(number_of_features) + 0)

    # Unfortunately, this library doesn't allow early stopping when the next split has a higher
    # impurity than the current split.
    #
    # Documentation for this function is at:
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.predict_log_proba
    self.__classifier = RandomForestClassifier(
        bootstrap=True,
        class_weight=None,
        criterion='gini',
        # Since early stopping options are limited, we might
        # need to tweak this number for short-term results.
        max_depth=None,
        max_features=features_at_each_split,
        # I don't think setting an arbitrary limit to leaf nodes is a good way to prevent over-
        # splitting, but this could potentially be tuned for early stopping purposes.
        max_leaf_nodes=None,
        # This is the option that lacks the early stopping method I want to use.
        # Since early stopping options are limited, this is another option that can be tuned to
        # avoid too many internal nodes.
        min_impurity_split=None,
        min_impurity_decrease=0.0,
        # 100 seems like a decent sample size, and may be a temporary solution to our early
        # stopping problem. Scratch that. Available data is much lower than previously mentioned.
        min_samples_split=10,
        min_samples_leaf=1,
        # Spectrum and recruiter data isn't weighted, so this should be zero.
        min_weight_fraction_leaf=0.0,
        # Seems like a good average number to start with according to this research paper:
        # https://www.researchgate.net/publication/230766603_How_Many_Trees_in_a_Random_Forest
        n_estimators=96,
        # Make sure this correctly detects the number of cores on the bare-metal
        # production server and runs on all of them.
        n_jobs=-1,
        oob_score=False,
        random_state=random_seed,
        verbose=1,
        warm_start=False)

    # A pipeline is necessary when imputing missing values to avoid leaking statistics about the
    # test data into the model when cross-validating.
    #
    # Documentation of function:
    # http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
    pipeline = Pipeline(
        [("imputer", self.__imputer), ("forest", self.__classifier)],
        # Enabling this may improve performance by caching
        memory=None)
    self.__pipeline = pipeline
# select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]


# build the pipeline for the numerical attributes
imputer = preprocessing.Imputer(strategy="median")

num_pipeline = pipeline.Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", preprocessing.Imputer(strategy="median")),
])
num_pipeline.fit_transform(train_data)

# We will also need an imputer for the string categorical columns
# (the regular Imputer does not work on those):

# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(base.BaseEstimator, base.TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X],
            index=X.columns)
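# Editor's note: the excerpt cuts off inside MostFrequentImputer.fit(). A likely completion of the
# class, following the usual BaseEstimator/TransformerMixin pattern (an assumption, not verbatim
# source): fit() records each column's most frequent value, transform() fills NaNs with it.
class MostFrequentImputer(base.BaseEstimator, base.TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X],
            index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)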
pd.dropna()
pd.fillna()

# plot
data.boxplot(column='finish', by='material')

### Preprocessing-------------------------------------------------------------
# Scale
sklp.minmax_scale(data, (0, 1))  # data must be numerical pd or np
standardized_Dataset = sklp.scale(Dataset, axis=0)
Normalized_Dataset = sklp.normalize(Dataset, norm='l2')
binarized_Dataset = sklp.binarize(Dataset, threshold=0.0)

# Missing data
imp = sklp.Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit_transform(Dataset)

# PCA
import sklearn.decomposition as skd
pca = skd.PCA(n_components=n, whiten=False)
pca.fit(Dataset)
Dataset_Reduced_Dim = pca.transform(Dataset)

# Train and Test
x_train, x_test, y_train, y_test = sklm.train_test_split(x, y, test_size=0.2)

# Dummy encoding
from statsmodels.tools import categorical
cat_encod = categorical(data, dictnames=False, drop=False)  # may need reshape(-1,1)
def start_preprocessing():
    # loading all the different datasets
    labels = pd.read_csv('data/labels.csv', delimiter=";")
    disc = pd.read_csv('data/discrimination.csv', delimiter=",")
    domestic = pd.read_csv('data/domesticviolence.csv', delimiter=",")
    abortion = pd.read_csv('data/legalabortion.csv', delimiter=",")
    legislation = pd.read_csv('data/legislation.csv', delimiter=",")
    proper = pd.read_csv('data/property.csv', delimiter=",")
    marriage = pd.read_csv('data/marriage.csv', delimiter=",")
    sex = pd.read_csv('data/sexualharassment.csv', delimiter=",")
    gdp = pd.read_csv('data/gdpcap.csv')  # , index_col=0).iloc[:,-6], delimiter=","
    gini = pd.read_csv('data/gini.csv')
    edu = pd.read_csv('data/edu.csv', delimiter=";")
    agb = pd.read_csv('data/agb.csv', delimiter=";")

    # PREPROCESSING
    # getting rid of countries not present in all sets
    # extracting all the unique countries in the different sets to compare
    discunique = disc.iloc[:, 0].unique()
    labelsunique = labels.iloc[:, 0].unique()
    uniquedomestic = domestic.iloc[:, 0].unique()
    uniquelegislation = legislation.iloc[:, 0].unique()
    uniqueproper = proper.iloc[:, 0].unique()
    uniquemarriage = marriage.iloc[:, 0].unique()
    uniquesex = sex.iloc[:, 0].unique()
    uniquegdp = gdp.iloc[:, 0].unique()
    uniquegini = gini.iloc[:, 0].unique()
    uniqueedu = edu.iloc[:, 0].unique()
    uniqueagb = agb.iloc[:, 0].unique()

    # discarding examples in the discrimination data set that are not present in the label set
    discarddisc = []
    for i in discunique:
        if i not in labelsunique:
            discarddisc.append(i)
            disc.drop(disc[disc.Economy == i].index[0], inplace=True)

    discarddom = []
    for i in uniquedomestic:
        if i not in labelsunique:
            discarddom.append(i)
            domestic.drop(domestic[domestic.Economy == i].index[0], inplace=True)

    discardleg = []
    for i in uniquelegislation:
        if i not in labelsunique:
            discardleg.append(i)
            legislation.drop(legislation[legislation.Economy == i].index[0], inplace=True)

    discardprop = []
    for i in uniqueproper:
        if i not in labelsunique:
            discardprop.append(i)
            proper.drop(proper[proper.Economy == i].index[0], inplace=True)

    discardmar = []
    for i in uniquemarriage:
        if i not in labelsunique:
            discardmar.append(i)
            marriage.drop(marriage[marriage.Economy == i].index[0], inplace=True)

    discardsex = []
    for i in uniquesex:
        if i not in labelsunique:
            discardsex.append(i)
            sex.drop(sex[sex.Economy == i].index[0], inplace=True)

    countriessex = sex.iloc[:, 0].unique()

    discardgdp = []
    for i in uniquegdp:
        if i not in countriessex:
            discardgdp.append(i)
            gdp.drop(gdp[gdp.iloc[:, 0] == i].index[0], inplace=True)

    discardgini = []
    for i in uniquegini:
        if i not in countriessex:
            discardgini.append(i)
            gini.drop(gini[gini.iloc[:, 0] == i].index[0], inplace=True)

    discardedu = []
    for i in uniqueedu:
        if i not in countriessex:
            discardedu.append(i)
            edu.drop(edu[edu.iloc[:, 0] == i].index[0], inplace=True)

    discardagb = []
    for i in uniqueagb:
        if i not in countriessex:
            discardagb.append(i)
            agb.drop(agb[agb.iloc[:, 0] == i].index[0], inplace=True)

    neunique = sex.iloc[:, 0].unique()
    labeldel = []
    for i in labelsunique:
        if i not in neunique:
            labeldel.append(i)
            labels.drop(labels[labels.Country == i].index[0], inplace=True)

    uniqueabortion = abortion.iloc[:, 0].unique()
    labelsunique = labels.iloc[:, 0].unique()
    discardabor = []
    for i in uniqueabortion:
        if i not in labelsunique:
            discardabor.append(i)
            abortion.drop(abortion[abortion.COUNTRY == i].index[0], inplace=True)

    gdptrim = gdp.iloc[:, [0, -6]]    # extracting the 2013 GDP per capita column and the corresponding country
    ginitrim = gini.iloc[:, [0, -6]]  # extracting the 2013 Gini column and the corresponding country
    gdptrim.is_copy = False
    ginitrim.is_copy = False

    # adding generic name "Economy"
    gdptrim.rename(columns={'Country Name': 'Economy'}, inplace=True)
    ginitrim.rename(columns={'Country Name': 'Economy'}, inplace=True)
    abortion.rename(columns={'COUNTRY': 'Economy'}, inplace=True)

    # filling in NaN values for gini
    means = {}  # dictionary to hold the mean gini value of the 7 different regions
    temp = list(sex.RegionName.unique())  # extracting the 7 different regions
    for i in temp:  # iterating over the regions
        countries = list(sex[sex.iloc[:, 1] == i].iloc[:, 0])  # extracting all the countries that belong to this region
        col_gini = 0  # accumulator for the region's gini coefficients
        k = 0  # counting the number of countries that contribute to the mean
        for country in countries:  # iterating over the extracted countries
            gini = ginitrim[ginitrim.iloc[:, 0] == country].iloc[0][1]  # extracting gini for country
            if not np.isnan(gini):  # if gini is not nan
                k += 1  # increment k
                col_gini += gini  # add to col_gini
        means[i] = col_gini / k  # assign mean gini value for region to dictionary

    countries = ginitrim[np.isnan(ginitrim.iloc[:, 1])].iloc[:, 0]  # extracting NaN gini countries
    for i in countries:  # iterating over the countries that have NaN values
        index = ginitrim[ginitrim.iloc[:, 0] == i].index[0]  # getting the index
        value = means[sex[sex.iloc[:, 0] == i].iloc[:, 1][sex[sex.iloc[:, 0] == i].iloc[:, 1].index[0]]]  # getting the region mean
        ginitrim.set_value(index, '2013', value)  # inputting mean value into the frame

    # merging all sets into one dataframe
    main = pd.DataFrame.merge(disc, domestic, how='outer')
    main = pd.DataFrame.merge(main, legislation, how='outer')
    main = pd.DataFrame.merge(main, proper, how='outer')
    main = pd.DataFrame.merge(main, marriage, how='outer')
    main = pd.DataFrame.merge(main, sex, how='outer')
    main = pd.DataFrame.merge(main, abortion, how='outer')
    main = pd.DataFrame.merge(main, gdptrim, how='outer')
    main = pd.DataFrame.merge(main, ginitrim, on='Economy')
    main = pd.DataFrame.merge(main, edu, on='Economy')
    main = pd.DataFrame.merge(main, agb, on='Economy')

    # setting index of main frame to Economy
    main = main.set_index('Economy')

    # building a dataset of only women's rights
    copy = main.iloc[:, :-4].copy()  # temporary copy frame excluding the continuous values
    copy = copy.drop('RegionName', axis=1)  # dropping the geographical feature RegionName from the rights set
    rightsatt = list(copy.columns)  # list of feature names, for plotting and investigation purposes
    rights = np.zeros(copy.shape)  # numpy matrix to hold binary values
    for idx, i in enumerate(copy.itertuples()):  # iterating over the copy frame
        for indel, item in enumerate(i):  # iterating over columns
            if indel != 0:  # if it's not the first column, i.e. the economy name
                if item == 'Yes':
                    rights[idx, indel - 1] = 1  # 1 if yes to question
                elif item == 'No':
                    rights[idx, indel - 1] = 0  # 0 if no to question
                else:
                    rights[idx, indel - 1] = np.nan  # nan if empty

    copy = main.copy()  # new temp frame that contains all features
    copy = copy.drop('RegionName', axis=1)  # deleting the RegionName feature
    x = np.zeros((119, copy.shape[1] + 7))  # numpy matrix to contain all features plus region names
    allatt = list(copy.columns)  # extracting the list of attributes
    for i in list(main.RegionName.unique()):  # appending the region names at the end of the attributes list
        allatt.append(i)
    regions = list(main.RegionName.unique())  # extracting region names
    for idx, i in enumerate(copy.itertuples()):  # iterating over the copy frame
        for indel, item in enumerate(i):  # iterating over columns
            if indel == 0:  # using the Economy name to get the region
                x[idx, 42 + regions.index(main.RegionName[item])] = 1  # dependent on the region's index in the region list
            elif indel == 39:  # if the indel is GDP per cap
                if idx == 105:  # if the country is 105: Syria
                    x[idx, indel - 1] = 35164000000 / 19810000  # manually inputting the GDP per cap for Syria
                else:
                    x[idx, indel - 1] = item  # otherwise input GDP per cap for the country
            elif indel == 40:  # if the indel is gini
                x[idx, indel - 1] = item
            elif indel == 41:  # if the indel is education
                x[idx, indel - 1] = item
            elif indel == 42:  # if the indel is age at first birth
                x[idx, indel - 1] = item
            else:  # the feature is binary
                if item == 'Yes':
                    x[idx, indel - 1] = 1
                elif item == 'No':
                    x[idx, indel - 1] = 0
                else:
                    x[idx, indel - 1] = np.nan

    # changing names in the attribute list
    allatt[allatt.index('2013_x')] = 'GDP per Cap'
    allatt[allatt.index('2013_y')] = 'GINI'
    allatt[allatt.index('2013')] = 'Education'

    # deleting excess columns in the labels data
    del labels['ISO3']
    del labels['2013 Rank']
    labels = labels.set_index('Country')  # setting the country as index
    labels = labels.sort_index()  # sorting to correspond to the sequence in x and rights

    # fitting an imputer to replace NaNs with the mode: a total of 30 countries have 15 or 16 NaNs
    rightsprep = preprocessing.Imputer(axis=0, strategy="most_frequent").fit(rights)
    rights = rightsprep.transform(rights)

    # fitting an imputer to replace NaNs with the mode in the complete set
    xprep = preprocessing.Imputer(axis=0, strategy="most_frequent").fit(x)
    x = xprep.transform(x)

    return rights, rightsatt, x, allatt, labels, main
titanic_test = pd.read_csv("test.csv") #Note that you have to do the same work on test as well #EDA titanic_test.shape titanic_test.info() titanic_test1 = pd.get_dummies(titanic_test, columns=['Pclass', 'Sex', 'Embarked']) titanic_test1.shape titanic_test1.info() titanic_test1.head(6) titanic_test1.describe() X_test = titanic_test1.drop(['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name'], 1) X_test.info() #create an instance of Imputer class with required arguments mean_imputer = preprocessing.Imputer() #Default value for Imputer is mean #compute mean of age and fare respectively mean_imputer.fit(X_test[['Fare']]) #fill up the missing data with the computed means X_test[['Fare']] = mean_imputer.transform(X_test[['Fare']]) titanic_test['Survived'] = lr_grid_estimator.predict(X_test) titanic_test.to_csv('submission.csv', columns=['PassengerId', 'Survived'], index=False)
titanic_test.shape
titanic_test.info()

# merge train and test data
titanic_all = pd.concat([titanic_train, titanic_test], axis=0)
titanic_all.shape
titanic_all.info()

# explore missing data
titanic_all.apply(lambda x: sum(x.isnull()))

# pre-process Embarked
titanic_all.Embarked[titanic_all['Embarked'].isnull()] = 'S'

# pre-process Age
age_imputer = preprocessing.Imputer()
titanic_all[['Age']] = age_imputer.fit_transform(titanic_all[['Age']])

# create family size feature
def size_to_type(x):
    if (x == 1):
        return 'Single'
    elif (x >= 2 and x <= 4):
        return 'Small'
    else:
        return 'Large'

titanic_all['FamilySize'] = titanic_all.SibSp + titanic_all.Parch + 1
titanic_all['FamilyType'] = titanic_all['FamilySize'].map(size_to_type)
# 1.1.2 Interval scaling: scale feature values into the [0, 1] range (applied column-wise)
features_new = preprocessing.MinMaxScaler().fit_transform(features)
# 1.1.3 Normalization: turn each row vector into a unit vector (applied per sample)
features_new = preprocessing.Normalizer().fit_transform(features)

# 1.2 Binarize quantitative features: set a threshold; values above it become 1, values at or below it become 0
features_new = preprocessing.Binarizer(threshold=3).fit_transform(features)

# 1.3 Encode qualitative (categorical) features (pandas.get_dummies can also be used)
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
# print(enc.transform([[0, 1, 3]]))
# print(enc.transform([[0, 1, 3]]).toarray())

# 1.4 Missing value imputation (pandas.fillna can also be used)
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
features_new = imp.fit_transform(
    vstack((array([nan, nan, nan, nan]), features)))

# 1.5 Data transformation
# 1.5.1 Polynomial transformation (applied row-wise)
features_new = preprocessing.PolynomialFeatures().fit_transform(features)
# 1.5.2 Transformation with a custom function, using log as an example
features_new = preprocessing.FunctionTransformer(
    np.log1p).fit_transform(features)

'''
2. Feature selection
'''
# 2.1 Filter
# 2.1.1 Variance-threshold method: select features whose variance exceeds the threshold
features_new = feature_selection.VarianceThreshold(
def impute_continuous_features(df, features):
    cont_imputer = preprocessing.Imputer()
    cont_imputer.fit(df[features])
    print(cont_imputer.statistics_)
    df[features] = cont_imputer.transform(df[features])
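# Editor's note: a hypothetical usage sketch for the helper above, in the spirit of the Titanic
# snippets elsewhere in this collection (the DataFrame and column names are assumptions):
#
# titanic_train = pd.read_csv('train.csv')
# impute_continuous_features(titanic_train, ['Age', 'Fare'])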
algorithms = {
    'Regression': ['All', 'ExtraTreeRegressor', 'GradientBoostingRegressor', 'DecisionTreeRegressor',
                   'LinearSVR', 'RandomForestRegressor', 'XGBRegressor', 'KNeighborsRegressor',
                   'LinearRegression'],
    'Classification': ['All', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier',
                       'GradientBoostingClassifier', 'KNeighborsClassifier', 'LinearSVC',
                       'LogisticRegression', 'XGBClassifier']
}

optionsForDropdown = {
    "changedataTypes": ['None', "Continuous", "Categorical"],
    "imputation_methods": ['None', "Mean", "Median", "Mode", "Back fill", "Forward fill"],
    "data_transformation_steps": ["None", "One Hot Encoding", "Label Encoding", "Normalize",
                                  "Scaling Standard", "Scaling Min Max", "Scaling Max Absolute"],
    "algorithmTypes": algorithms
}

processe_short = {
    'Mean': preprocessing.Imputer(strategy="mean"),
    'Median': preprocessing.Imputer(strategy="median"),
    # Imputer has no "mode" strategy; "most_frequent" imputes the mode
    'Mode': preprocessing.Imputer(strategy="most_frequent"),
    'Scaling Min Max': preprocessing.MinMaxScaler(),
    'Scaling Standard': preprocessing.StandardScaler(),
    'Label Encoding': preprocessing.LabelEncoder(),
    'One Hot Encoding': preprocessing.LabelBinarizer(),
    'Normalize': preprocessing.StandardScaler(),
    'Scaling Max Absolute': preprocessing.MinMaxScaler(),
}


class AutoMLUtilities:
    def dataDescription(self, data):
        dataDtype = dict(data.dtypes)
        dataMissingVal = dict(data.isnull().sum())
def __init__(self, n, d):
    self.transformer_ = preprocessing.Imputer(strategy='most_frequent')
    self.param_grid_ = {}