def bnp_svm(train, test):
    print('bnpsvm')
    ## If a value is missing, set it to the column average
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    #print("cleaning data")
    train = train.sample(1000)

    ## set up training data
    train1 = train.select_dtypes(include=['float64'])
    imp.fit(train1)
    train1 = imp.transform(train1)
    train1 = np.array(train1).astype(float)

    ## set up real y
    target = np.array(train['target']).astype(int)

    ## set up testing data
    test1 = test.select_dtypes(include=['float64'])
    test1 = imp.transform(test1)
    test1 = np.array(test1).astype(float)

    #print("training...")
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    #print("testing")
    clf.fit(train1, target)
    #print("predicting")
    yhat = clf.predict_proba(test1)
    return yhat

#print(bnp_svm(train, test))
def test(): vec = DictVectorizer() imp = Imputer(missing_values='NaN', strategy='mean', axis=0) for filename in glob.glob(r'../dataset/UCI/*.arff'): basename = re.sub(r'(\..*?)$','',os.path.basename(filename)) print basename if basename != DS: continue # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb')) data = arff.loadarff(filename)[0] X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray() imp.fit(X) X = imp.transform(X) labels = np.array([row[-1] for row in data]) y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels]) random = np.random.permutation(range(len(X))) print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())])) for iteration in xrange(10): X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10) for train, test in kf: length, train_size = len(train), 0.1 X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0) for R in xrange(2,10): ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]]) # print "%s R=%d"%(basename,R), cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix) exit()
def clf_fit_transform(self):
    # import dataset
    self.df = pd.read_csv(self.dataset, na_values=["?"])

    # clean dataset: impute missing values
    # (strategy can be 'median', 'most_frequent' or 'mean')
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
    imr.fit(self.df)
    X_imputed_df = pd.DataFrame(imr.transform(self.df.values), columns=self.df.columns)
    X_imputed_df.drop(['id'], 1, inplace=True)

    X = np.array(X_imputed_df.drop(['class'], 1))
    y = np.array(X_imputed_df['class'])
    le = LabelEncoder()
    y = le.fit_transform(y)

    # cross validation
    self.X_train, self.X_test, self.y_train, self.y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=0)

    # define the scaler object and fit it on the training data
    self.stdsc = StandardScaler()
    self.X_train_std = self.stdsc.fit_transform(self.X_train)
    # once it has learned the statistics it can be applied to other inputs
    self.X_test_std = self.stdsc.transform(self.X_test)
def load_datasets(feature_paths, label_paths):
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # read one feature file with pandas read_table:
        # comma separated, '?' marks missing values, no header row
        df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # Imputer with strategy='mean' fills missing data with the column mean;
        # fit() trains the preprocessor, transform() produces the imputed result
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # append the preprocessed data to feature, one feature file at a time
        feature = np.concatenate((feature, df))

    for file in label_paths:
        # same as above; the label files have no missing values,
        # so the newly read data is appended to label directly
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    label = np.ravel(label)
    # return the feature set and the label set
    return feature, label
def load_datasets(feature_paths, label_paths):
    '''Read the feature files and the label files and return them.'''
    # feature array with 41 columns, matching the feature dimension;
    # empty label array with 1 column, matching the label dimension
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))

    for file in feature_paths:
        # read one feature file with pandas read_table: comma separated,
        # '?' marks missing values, and the file has no header row
        #df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # pandas.read_csv(source, encoding='utf-8',
        #                 parse_dates=[0] parses column 0 as dates,
        #                 index_col=0 uses column 0 as the row index)
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        # DataFrame.sort_index(axis=0, ascending=True, inplace=False)
        # would sort the rows by time in ascending order
        #df.sort_index(0, ascending=True, inplace=True)

        # Imputer with strategy='mean' fills missing data with the column mean;
        # fit() trains the preprocessor, transform() produces the imputed result
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # append the preprocessed data to feature, one feature file at a time
        feature = np.concatenate((feature, df))

    # read the label files
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # flatten the labels into a one-dimensional vector
    label = np.ravel(label)
    return feature, label
def fit(self, train_x, train_y=None, is_norm=True):
    # Min-max normalization (subtract the column minimum, then divide by the range)
    if is_norm:
        train_x_min = train_x.min(0)
        train_x_ptp = train_x.ptp(axis=0)
        train_x = (train_x.astype(float) - train_x_min) / train_x_ptp
        if np.any(train_y):
            train_y = (train_y.astype(float) - train_x_min) / train_x_ptp

    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    imp.fit(train_x)

    if np.isnan(train_x).any():
        log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
        train_x = imp.transform(train_x)

    if np.any(train_y) and np.isnan(train_y).any():
        log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
        train_y = imp.transform(train_y)

    if np.any(train_y):
        self.model.fit(train_x, train_y)
    else:
        self.model.fit(train_x)
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit an imputer for each column in the data frame
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])

        return output
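# Hypothetical usage sketch for the ImputeCategorical transformer above (the
# frame and column names are assumptions, not part of the original source):
# the columns are label-encoded integers where 0 marks a missing category,
# which the transformer replaces with each column's most frequent value.
import pandas as pd

df_demo = pd.DataFrame({'sex': [1, 2, 0, 1], 'embarked': [3, 0, 3, 2]})
df_demo = ImputeCategorical(columns=['sex', 'embarked']).fit(df_demo).transform(df_demo)
print(df_demo)  # the zeros are replaced by 1 and 3 respectively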
def data_preprocessing_descriptive(Extracted_Features,Coma_Features,Corrected_Features): lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive") tools.separate_coma(Extracted_Features,Coma_Features) for root, dirs, files in os.walk(Coma_Features): for i in files: if not i.startswith('.'): input_i=Coma_Features+i output_i=Corrected_Features+i lines=tools.file_lines(input_i) ncol=tools.file_col(input_i) if lines >= 2: file = open(output_i, "w") writer=csv.writer(file, lineterminator='\t') data = np.genfromtxt(input_i,delimiter=',') X = data[1:, 2:] neuron_type = np.genfromtxt(input_i,delimiter=',',dtype=None) y = neuron_type[:, 0] # (class) neuron_name = np.genfromtxt(input_i,delimiter=',',dtype=None) z = neuron_name[:, 1] # Neuron names features = np.genfromtxt(input_i,delimiter=',',dtype=None) w = features[0, :] # features names #Replace missing values 'nan' by column mean imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit(X) Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0) # Output replacement "Nan" values Y=imp.transform(X) #print i #print Y.shape, y.shape,z.shape #print Y.shape[1] #################### for line in xrange(Y.shape[0]+1): for colonne in xrange(Y.shape[1]+2): if line == 0: if colonne == 0: file.write("%s\t"%y[line]) else: if colonne == 1: file.write("%s\t"%z[line]) else: file.write("%s\t"%w[colonne]) else: if colonne == 0: file.write("%s\t"%y[line]) else: if colonne == 1: file.write("%s\t"%z[line]) else: file.write("%f\t"%Y[line-1,colonne-2]) file.write("\n") ######################### else: print "Only one morphology !!!" file.close() lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
def eval_func(chromosome): t_par = chromosome.getInternalList() print("## Start with Individual : " + str(t_par)) eta = t_par[0] max_depth = t_par[1] subsample = t_par[2] colsample_bytree = t_par[3] n_estimators = t_par[4] test_size = t_par[5] imp_start = t_par[6] num_of_feat_corr = t_par[7] print("## Filling missing data") imp = Imputer(missing_values='NaN', strategy=imp_start, axis=0) imp.fit(train[features]) train[features] = imp.transform(train[features]) test[features] = imp.transform(test[features]) curr_features = copy.deepcopy(features) print("## Creating Random features based on Correlation") output_cor = correlation_p[output_col_name].sort_values() most_neg_cor = list(output_cor.index[0:num_of_feat_corr].ravel()) most_pos_cor = list(output_cor.index[(-2-num_of_feat_corr):-2].ravel()) for f1, f2 in pairwise(most_neg_cor): train[f1 + "_" + f2] = train[f1] + train[f2] test[f1 + "_" + f2] = test[f1] + test[f2] curr_features += [f1 + "_" + f2] for f1, f2 in pairwise(most_pos_cor): train[f1 + "_" + f2] = train[f1] + train[f2] test[f1 + "_" + f2] = test[f1] + test[f2] curr_features += [f1 + "_" + f2] params = {"objective": "binary:logistic", "eta": eta, "nthread":3, "max_depth": max_depth, "subsample": subsample, "colsample_bytree": colsample_bytree, "eval_metric": "logloss", "n_estimators": n_estimators, "silent": 1 } num_boost_round = 10000 test_size = test_size best_score = train_model(curr_features,params,num_boost_round,test_size) grid_search_pd.loc[len(grid_search_pd),grid_search_columns] = [eta,max_depth,subsample,colsample_bytree,n_estimators,test_size,imp_start,num_of_feat_corr,best_score] timestamp = time.strftime("%Y%m%d-%H%M%S") print("########################## Round Time Stamp ==== " + timestamp) grid_search_pd.to_csv(grid_search_file, index=False) return best_score
def trainSVM(x1, x2, kernel):
    # prepare data
    x1 = map(list, x1)
    x2 = map(list, x2)
    X = x1 + x2
    y1 = ones((shape(x1)[0], 1))
    y2 = -1 * ones((shape(x2)[0], 1))
    Y = list(y1) + list(y2)
    Y = ravel(Y)
    #print 'Y'

    if kernel == 0:
        # Instantiate the linear SVM classifier.
        svm = LinearSVC()
        # Define the C values tried by GridSearch; C increases the penalty on misclassified points.
        params = {'C': [1, 10, 50, 100, 200, 300]}
        grid = GridSearchCV(svm, params, cv=5)
    else:
        # Instantiate the RBF SVM classifier.
        svm = SVC(probability=True)
        # Define the C values tried by GridSearch; C increases the penalty on
        # misclassified points, while gamma (left at its default) sets the width of the Gaussian kernel.
        params = {'C': [50, 100, 200, 300]}
        grid = GridSearchCV(svm, params, cv=5)

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    trainData = imp.transform(X)

    grid.fit(trainData, Y)  # Run fit with all sets of parameters.
    model = grid.best_estimator_
    return model
def preprocess_apply(data, missingvaluemethod, preprocessingmethods):
    # imputing missing values
    if missingvaluemethod != Constants.MISSING_VALUE_METHOD_NONE:
        if missingvaluemethod == Constants.MISSING_VALUE_METHOD_MEAN:
            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        elif missingvaluemethod == Constants.MISSING_VALUE_METHOD_MEDIAN:
            imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        elif missingvaluemethod == Constants.MISSING_VALUE_METHOD_MOST_FREQUENT:
            imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
        imp.fit(data)
        data = imp.transform(data)
    else:
        data = np.asarray(data)

    # scale data, one column at a time
    res = np.array([])
    for i in range(0, len(preprocessingmethods)):
        field = [[x[i]] for x in data]
        if preprocessingmethods[i] == Constants.SCALING_METHOD_NONE:
            pass
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_STANDARDIZATION:
            scaler = preprocessing.StandardScaler().fit(field)
            field = scaler.transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_MINMAX:
            field = preprocessing.MinMaxScaler().fit_transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_CATEGORICAL:
            enc = preprocessing.OneHotEncoder()
            enc.fit(field)
            field = enc.transform(field).toarray()

        if i == 0:
            res = field
        else:
            res = np.concatenate((res, field), axis=1)
    return res
class FeaturePreProcesser():
    def __init__(self):
        pass

    def fit(self, X):
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)
        X = self.std_scaler.transform(X)

        return X

    def transform(self, X):
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
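# Hypothetical usage sketch (the arrays are assumptions, not from the original
# source) showing the intended fit-on-train / transform-on-test flow of
# FeaturePreProcesser: imputer and scaler statistics come from the training split only.
import numpy as np

pre = FeaturePreProcesser()
X_tr = np.array([[1.0, np.nan], [2.0, 4.0], [3.0, 6.0]])
X_te = np.array([[np.nan, 5.0]])
X_tr = pre.fit_transform(X_tr)  # impute with column means, then standardize
X_te = pre.transform(X_te)      # reuse the statistics learned on X_tr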
class ImputerWrapper:
    """
    A simple wrapper around Imputer that also supports using zero to fill in
    missing values. If an entire column is NaN it gets filled with 0 to avoid
    Imputer removing the column.
    """

    def __init__(self, missing_values='NaN', strategy='zero', axis=0, verbose=0, copy=False):
        self.strategy = strategy
        self.imputer = None
        if strategy != 'zero':
            self.imputer = Imputer(missing_values, strategy, axis, verbose, copy)

    def prepare(self, X):
        for j in range(X.shape[1]):
            all_nan = True
            for i in range(X.shape[0]):
                if not numpy.isnan(X[i][j]):
                    all_nan = False
                    break
            if all_nan:
                logging.info('column %d all nan, filling with 0' % j)
                for i in range(X.shape[0]):
                    X[i][j] = 0.0

    def fit(self, X, y=None):
        if self.strategy == 'zero':
            return self
        self.prepare(X)
        self.imputer.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        if self.strategy == 'zero':
            for i in range(X.shape[0]):
                for j in range(X.shape[1]):
                    if numpy.isnan(X[i][j]):
                        X[i][j] = 0.0
            return X
        self.prepare(X)
        return self.imputer.fit_transform(X, y, **fit_params)

    def get_params(self, deep=True):
        if self.strategy == 'zero':
            return None
        return self.imputer.get_params(deep)

    def set_params(self, **params):
        if self.strategy == 'zero':
            return self
        self.imputer.set_params(**params)
        return self

    def transform(self, X):
        if self.strategy == 'zero':
            for i in range(X.shape[0]):
                for j in range(X.shape[1]):
                    if numpy.isnan(X[i][j]):
                        X[i][j] = 0.0
            return X
        return self.imputer.transform(X)
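# Hypothetical usage sketch (an assumption, not in the original source) for the
# ImputerWrapper above: with the default strategy='zero' the NaNs are replaced
# in place with 0.0, while any other strategy delegates to sklearn's Imputer
# after all-NaN columns have been forced to zero.
import numpy as np

X_demo = np.array([[1.0, np.nan], [np.nan, np.nan]])
zero_filled = ImputerWrapper().fit_transform(X_demo.copy())          # -> [[1, 0], [0, 0]]
mean_filled = ImputerWrapper(strategy='mean').fit_transform(X_demo)  # all-NaN column set to 0 before imputing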
def my_imputer(name, strat, value):
    if value == 0:
        data[name] = data[name].fillna(0)
    imp = Imputer(missing_values=value, strategy=strat, axis=0)
    x = data[name]
    x = x.reshape(-1, 1)
    imp.fit(x)
    data[name] = imp.transform(x)
def ImputeAndGetFinalTrainTestData(train, test):
    X_train = train[:, :-1]
    y_train = train[:, -1]
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(test.as_matrix())
    return (X_train, y_train, X_test)
def imput_data(data):
    numSubsets = data.shape[-1]
    for i in range(numSubsets):
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(data[:, :, i])
        data[:, :, i] = imp.transform(data[:, :, i])
        data[:, -1, i] = preprocessing.scale(data[:, -1, i])
    return data
def test_threshold_SGD(): train = pandas.read_csv('data/train_v2.csv') # test = pandas.read_csv('data/test_v2.csv') train_loss = train.loss # train = train[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']] # test = test[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']] # train = train[['f527', 'f528', 'f274', 'f271']] # test = test[['f527', 'f528', 'f274', 'f271']] imp = Imputer() imp.fit(train) train = imp.transform(train) # test = imp.transform(test) train=pre.StandardScaler().fit_transform(train) # test=pre.StandardScaler().fit_transform(test) train_loss_array = train_loss.apply(lambda x: 1 if x>0 else 0).values clf = SGDClassifier(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None) clf.fit(train,train_loss_array) train = clf.transform(train, threshold = "1.25*mean") print train.shape kf = StratifiedKFold(train_loss.values, n_folds=10, indices=False) threshold = 0.999999999164 mean_mae = 0. for train_i, test_i in kf: # print len(train_i) X_train_split, X_test_split, y_train_split, y_test_split = train[train_i], train[test_i], train_loss_array[train_i], train_loss_array[test_i] y_test_split_initial = train_loss[test_i].values clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-4, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None) clf.fit(X_train_split,y_train_split) probas_ = clf.predict_proba(X_test_split) prediction_proba = probas_[:,1] predictionIndexes0 = np.where(prediction_proba <= threshold)[0] predictionIndexes1 = np.where(prediction_proba > threshold)[0] prediction = np.asarray([0.] * y_test_split_initial.shape[0]) prediction[predictionIndexes1] = 10. prediction[predictionIndexes0] = 0. mae = mean_absolute_error(y_test_split_initial, prediction) mean_mae += mae print "Split MAE: " + str(mae) mean_mae = mean_mae / 10. print "Average MAE: " + str(mean_mae)
def typetransform(data):
    if dattype(data) is unicode:
        le.fit(data)
        data = le.transform(data)
    else:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
        imp.fit(data)
        data = imp.transform(data)
    return data
def main():
    s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit([1, 2, 3, 4, 5, 6, 7])
    x = pd.Series(imp.transform(s).tolist()[0])
    print(x)
def MisingValuesFiller(X_train):
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X_full = np.array(X_train)
    imp.fit(X_full)
    #print X_full
    X_tests = [[np.nan, 2], [6, np.nan]]
    #print(imp.transform(X_tests))
    return imp
def impute(self, sample):
    """
    Create a Sample imputation model according to the method specified
    in the descriptor.

    :param sample: Sample instance to create the imputer for.
    :returns: Imputer instance.
    """
    imp = Imputer(missing_values='NaN', strategy=self.fix_method, axis=0)
    imp.fit(sample.attributes)
    return imp
def imputator(features):
    """Fill in missing values with the mean of the remaining samples.

    Keyword arguments:
    features -- feature matrix
    """
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(features)
    return imp.transform(features)
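# Hypothetical call (not in the original source) illustrating the column-mean
# behaviour of imputator: each NaN is replaced by the mean of its own column.
import numpy as np
print(imputator(np.array([[1.0, np.nan], [3.0, 4.0]])))  # [[1. 4.] [3. 4.]]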
def __init__(self, alldata, labels):
    data = deepcopy(alldata)
    print("00: (%d,%d)" % (data.shape[0], data.shape[1]))
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    imp.fit(data)
    self.data = deepcopy(imp.transform(data))
    print("0: (%d,%d)" % (self.data.shape[0], self.data.shape[1]))
    le = LabelEncoder()
    le.fit(['f', 't'])
    self.labels = le.transform(labels)
def get_prob(clf, t1, t2, feat_table, feature_names):
    feat_values = apply_feat_fns(t1, t2, feat_table)
    feat_values = pd.Series(feat_values)
    feat_values = feat_values[feature_names]
    v = feat_values.values
    if mg._impute_flag == True:
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        imp.fit(v)
        v = imp.transform(v)
    p = clf.predict_proba(v)
    return p[0]
def eliminate_features(): use_sample = False if use_sample: train = pandas.read_csv('data/train_v2_sample_10k.csv') # test = pandas.read_csv('data/test_v2_sample_10k.csv') average_best_t = 0.148846575958 else: train = pandas.read_csv('data/train_v2.csv') # test = pandas.read_csv('data/test_v2.csv') ### To use on full train set average_best_t = 0.164463473639 train_loss = train.loss cols = set(train.columns) cols.remove('loss') cols = list(cols) train = train[cols] column_names = train.columns.values.tolist() # train = train[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']] imp = Imputer() imp.fit(train) train = imp.transform(train) # test = imp.transform(test) train=pre.StandardScaler().fit_transform(train) # test=pre.StandardScaler().fit_transform(test) train_loss_array_libsvm = train_loss.apply(lambda x: -1 if x>0 else 1).values # b = np.delete(train,0,1) # c = np.delete(train,1,1) # print b.shape[1] # print c.shape best_acc = 91.3437 best_eliminated_features = [] best_features = [18, 289, 290, 17, 402, 19, 560, 16, 287, 310, 403] selected_train = train[:,best_features] os.chdir(liblinear_path) train_command = "./train -s 5 -c 0.01 -v 5 -e 0.001 /home/ema/Workspace/Projects/Kaggle/Loan_Default_Prediction/data/train_tmp.liblinear" datasets.dump_svmlight_file(selected_train, train_loss_array_libsvm, "/home/ema/Workspace/Projects/Kaggle/Loan_Default_Prediction/data/train_selected_f.liblinear", zero_based=False, comment=None, query_id=None) generation = 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', type=str, help="SST data file", required=True)
    parser.add_argument('-p', type=str, help="Precipitation data file")
    args = parser.parse_args()
    sstFile = args.s
    precFile = args.p

    sstData = read_file(sstFile)
    sstData = np.transpose(parse_sst(sstData))[564:-74, :]

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(sstData)
    pickle_data(imp.transform(sstData), sstFile)
class RandomForestLearner(Orange.classification.SklFitter):
    def __init__(self, n_estimators=10, max_features="auto",
                 random_state=None, max_depth=3, max_leaf_nodes=5):
        self.params = vars()

    def fit(self, X, Y, W):
        self.imputer = Imputer()
        self.imputer.fit(X)
        X = replace_nan(X, self.imputer)
        rf_model = RandomForest(**self.params)
        rf_model.fit(X, Y.ravel())
        return RandomForestClassifier(rf_model, self.imputer)
def impute(data_dict, keys):
    from sklearn.preprocessing import Imputer
    for key in keys:
        x = [data_dict[k][key] for k in data_dict.keys()]
        imp = Imputer(missing_values='NaN', strategy="mean", axis=1)
        imp.fit(x)
        x = imp.transform(x)[0]
        names = data_dict.keys()
        for j in range(0, len(data_dict.keys())):
            data_dict[names[j]][key] = x[j]
    return data_dict
def start(train_X, train_Y): print("Starting imputation of Training Set...\n") imputer = Imputer(missing_values="NaN", strategy='mean', axis=0); imputer.fit(train_X); train_X = imputer.transform(train_X); train_Y = [y if y <= 69.0 else 69.0 for y in train_Y]; #Capping the rain at 69mm/hr train_Y = np.array(train_Y); print("Imputation Completed\n") parameters_to_try = generateParams(); print("No of Paramters to test " + str(len(parameters_to_try))); print("Copying Parameters"); results = []; #Contruct parameters as list batch_size = 2; for i in xrange(0, len(parameters_to_try), batch_size): models_to_try = [ (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i] ) ]; print("Releaseing a batch") if i+1 < len(parameters_to_try) : models_to_try.append( (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i+1] ) ); #Create a Thread pool. pool = Pool(2); results_t = pool.map( train_model_wrapper, models_to_try ); pool.close(); pool.join(); del models_to_try; results.append(results_t); best_params = None; best_crps = sys.float_info.max; for i in range(0, len(results)): if results[i][1] < best_crps: best_crps = results[i][1]; best_params = results[i][0]; print("Best Params : " + str(best_params)); print("Best CRPS : " + str(best_crps)); estimator = RandomForestRegressor(**best_params) estimator.fit(train_X, train_Y); return imputer, estimator;
def buildArraysFromROOT(tree,allowedFeatures,cut,skipEvents,maxEvents,name): dataContainer = {} featureNames = [] eventCounter = -1 gROOT.Reset() # Get branch names for item in tree.GetListOfBranches(): featureName = item.GetName() if featureName in allowedFeatures: featureNames.append(featureName) dataContainer[featureName] = [] # Build the event list tcut = TCut(cut) tree.Draw(">>eventList",tcut) eventList = TEventList() eventList = gDirectory.Get("eventList") nSelectedEvents = eventList.GetN() # Event loop for i in range(0,nSelectedEvents): if (i < skipEvents): continue if (i % 100 == 0): sys.stdout.write("Reading %s: %d%% \r" % (tree.GetName(),100*i/(maxEvents+skipEvents)) ) sys.stdout.flush() if i >= (maxEvents+skipEvents): break selectedEvNum = eventList.GetEntry(i) tree.GetEntry(selectedEvNum) for feature in featureNames: dataContainer[feature].append(tree.__getattr__(feature)) sys.stdout.write("\n") # Make the numpy arrays outputArray = np.array([]) for feature in dataContainer.keys(): column = dataContainer[feature] feature_vector = np.asarray(column) feature_vector = feature_vector.reshape(feature_vector.size,1) if outputArray.shape[0]==0: outputArray = feature_vector else: outputArray = np.concatenate((outputArray,feature_vector),axis=1) imp = Imputer(missing_values=-999, strategy='mean', axis=0) imp.fit(outputArray) outputArray = imp.transform(outputArray) print name print "Events: ",outputArray.shape[0] print "Features: ",outputArray.shape[1] return outputArray
def train_LSTM(path_to_store_weight_file=None, number_of_iteration=1): #HYPER-PARAMETERS input_size = 2436 output_size = 2 hidden_size = 100 num_layers = 1 batch_size = 151 #number of sequences I want to process in parallel num_epochs = 1 #train the data 1 time learning_rate = 0.1 #learning rate def flatten(list_): for el in list_: if hasattr(el, "__iter__") and not isinstance(el, basestring): for sub in flatten(el): yield sub else: yield el output, test_position = generate_sample_test() # generate_sample_test() # output= pickle.load(open("test_data.p" , "rb" )) # test_position = pickle.load(open("gt_data.p","rb")) list = [] for j in range(151): test_data = [] for i in range(6): for obj in vars(output[j][i])["players"]: for obj in vars(output[j][i])["players"]: test_data.append(obj.get_info()) test_data.append(vars(output[j][i])["quarter"]) test_data.append(vars(output[j][i])["game_clock"]) test_data.append(vars(output[j][i])["ball"].get_info()) test_data.append(vars(output[j][i])["shot_clock"]) list_1_scaled = [x for x in flatten(test_data)] list.append(list_1_scaled) data = pd.DataFrame(list) data_1 = data.copy() data_1 = data_1.values from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler scaler = StandardScaler() imputer = Imputer(strategy="mean") imputer.fit(data_1) data_1 = imputer.transform(data_1) data_1_scaled = scaler.fit_transform(data_1) test_data = torch.DoubleTensor(np.array(data_1_scaled)) test_data = test_data.contiguous() test_position = torch.FloatTensor(np.array(test_position)) test_position = test_position.contiguous() y = Variable(test_position) x = test_data.view(batch_size, input_size) y = y.view(batch_size, output_size) model = RNN(input_size, hidden_size, num_layers, len(output)) print model optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) loss = torch.nn.MSELoss(size_average=False) #begin to train for epoch in range(number_of_iteration): # Pytorch accumulates gradients. We need to clear them out before each instance optimizer.zero_grad() model.hidden = model.init_hidden() # Also, we need to clear out the hidden state of the LSTM, # detaching it from its history on the last instance. out = model(x.unsqueeze(1).float()) #TINC EL PROBLEMA AQUI AMB EL TRAINING DATA err = loss(out, y) err.backward() optimizer.step() print('-------done LSTM') torch.save(model, path_to_store_weight_file)
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# importing dataset
dataset = pd.read_csv('Data.csv')
features = dataset.iloc[:, :-1].values
labels = dataset.iloc[:, 3].values

# taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer = imputer.fit(features[:, 1:3])
features[:, 1:3] = imputer.transform(features[:, 1:3])

# encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder_features = LabelEncoder()
features[:, 0] = label_encoder_features.fit_transform(features[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
features = onehotencoder.fit_transform(features).toarray()
label_encoder_labels = LabelEncoder()
labels = label_encoder_labels.fit_transform(labels)

# splitting the data set into the Training set and Test set
from sklearn.model_selection import train_test_split
def imputting_values(data):
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(data)
    # assign the result back: Imputer.transform does not modify its input in place
    data = imp.transform(data)
    return data
test_reader.get_number_of_examples()) del test_reader print "==> elapsed time = %.3f" % (time.time() - prev_time) print "train.shape ", train_X.shape, train_y.shape print "val.shape", val_X.shape, val_y.shape print "test.shape", test_X.shape, test_y.shape print "==> imputing missing values" imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True) imputer.fit(train_X) train_X = np.array(imputer.transform(train_X), dtype=np.float32) val_X = np.array(imputer.transform(val_X), dtype=np.float32) test_X = np.array(imputer.transform(test_X), dtype=np.float32) print "==> normalizing data" scaler = StandardScaler() scaler.fit(train_X) train_X = scaler.transform(train_X) val_X = scaler.transform(val_X) test_X = scaler.transform(test_X) if not os.path.exists("cf_activations"): os.mkdir("cf_activations") if not os.path.exists("cf_results"):
# Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer.fit(X[:, 1:3])  # upper bound is excluded; the imputer object is fitted to X
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Categorical --> dummy encoding (one-hot encoding)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])  # specify the column index
X = onehotencoder.fit_transform(X).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
""" Spyder Editor This is a temporary script file. """ import pandas as pd import numpy as np import matplotlib.pyplot as plt dataset = pd.read_csv("C:\\Users\\Admin\\Downloads\\Data_Preprocessing\\Data.csv") X= dataset.iloc[:,:-1].values y= dataset.iloc[:,len(dataset.columns)-1].values from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) imputer = imputer.fit(X[:,1:3]) X[:,1:3]= imputer.transform(X[:,1:3]) from sklearn.preprocessing import LabelEncoder, OneHotEncoder oneHotEncoder = OneHotEncoder(categorical_features=[0]) labelEncoder_X= LabelEncoder() X[:,0] = labelEncoder_X.fit_transform(X[:,0]) X=oneHotEncoder.fit_transform(X).toarray() labelEncoder_y = LabelEncoder() y = labelEncoder_y.fit_transform(y) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0) #Feature scaling from sklearn.preprocessing import StandardScaler
dataset.ix[100, 'isFlaggedFraud'] = np.NaN
# check which row has NaN
dataset[dataset['isFlaggedFraud'].isnull()]

# impute a value in the NaN place
# take the important columns, then split train/test
dataset_pred = dataset[[
    'step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud'
]]
X = dataset_pred.loc[:, dataset_pred.columns != 'isFraud'].values
y = dataset_pred.iloc[:, 7].values

# impute the most frequent value
imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer = imputer.fit(X[:, 7:8])
X[:, 7:8] = imputer.transform(X[:, 7:8])

# see correlation plot of numeric variables (not working)
#sns.set(style="ticks", color_codes=True)
#g = sns.pairplot(dataset_pred, hue="isFraud")

# use encoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
#labelencoder_X_2 = LabelEncoder()
#X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# create dummy variables for 'type', which has 5 categories;
# apply to column 1 since it has more than 2 categories
onehotencoder = OneHotEncoder(categorical_features=[1])
# Using Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_1, y_1)

# Predicting the results
test_set = pd.read_csv('../dataset/test.csv')
test_set_1 = test_set.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 10]]
X_test_1 = test_set_1.iloc[:, [1, 3, 4, 5, 6, 7, 8]].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_test_1[:, [2, 5]])
X_test_1[:, [2, 5]] = imputer.transform(X_test_1[:, [2, 5]])

X_test_1[:, 1] = labelencoder_1.transform(X_test_1[:, 1])
X_test_1[:, 6] = labelencoder_2.transform(X_test_1[:, 6])
X_test_1 = onehotencoder_1.transform(X_test_1).toarray()
X_test_1 = X_test_1[:, 1:]
X_test_1 = onehotencoder_2.transform(X_test_1).toarray()
X_test_1 = X_test_1[:, 1:]
X_test_1 = sc_X.transform(X_test_1)

y_pred = classifier.predict(X_test_1)
# In[12]:

# again: our original array
df.values

# In[13]:

# impute missing values via the column mean
from sklearn.preprocessing import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

# ## Understanding the scikit-learn estimator API

# In[14]:

# In[15]:
## seeing which explanatory feature rows got removed. Looks like none.
response_series.index[~response_series.index.isin(explanatory_df.index)]

### now, let's separate the numeric explanatory data from the string data
string_features = explanatory_df.ix[:, explanatory_df.dtypes == 'object']
numeric_features = explanatory_df.ix[:, explanatory_df.dtypes != 'object']

# fill string NaNs with 'Nothing' (remember to drop string features that are all
# NaNs, as they will show up as all 'Nothing' when we start binning or look for
# features with no variation)
string_features = string_features.fillna('Nothing')

# cleaning up string features
string_features = cleanup_data(string_features)

# binarizing string features
encoded_data = get_binary_values(string_features)

## imputing features
imputer_object = Imputer(missing_values='NaN', strategy='median', axis=0)
imputer_object.fit(numeric_features)
numeric_features = pandas.DataFrame(imputer_object.transform(numeric_features),
                                    columns=numeric_features.columns)

## pulling together numeric and encoded data.
explanatory_df = pandas.concat([numeric_features, encoded_data], axis=1)
explanatory_df.head()

# now, let's find features with no variance
no_variation = find_zero_var(explanatory_df)
explanatory_df.drop(no_variation['toDelete'], inplace=True)

# deleting perfect correlation
no_correlation = find_perfect_corr(explanatory_df)
explanatory_df.drop(no_correlation['toRemove'], 1, inplace=True)
#plt.xlabel('numbers of features to keep')
#plt.ylabel('ratio of information remains')
#plt.annotate('Point(%d,%.2f)' % (10, variances[9]), xy=(10, variances[9]),
#             xytext=(+10, +0.7), fontsize=15,
#             arrowprops=dict(arrowstyle="->"))
#plt.show()

pca = PCA(n_components=15)
hy_compressed = pca.fit_transform(hy_dummies)
hy_compressed_df = pd.DataFrame(hy_compressed,
                                columns=list(['hy' + str(x) for x in range(1, 16)]))
entbase = entbase.join(hy_compressed_df)

# ZCZB
imp_nan = Imputer(missing_values='NaN', strategy='median', axis=0)
imp_nan.fit(entbase.loc[:, ['ZCZB']])
entbase.loc[:, ['ZCZB']] = imp_nan.transform(entbase.loc[:, ['ZCZB']])
imp_0 = Imputer(missing_values=0, strategy='median', axis=0)
imp_0.fit(entbase.loc[:, ['ZCZB']])
entbase.loc[:, ['ZCZB']] = imp_0.transform(entbase.loc[:, ['ZCZB']])
scaler = StandardScaler()
scaler.fit(entbase['ZCZB'])
entbase['ZCZB'] = scaler.transform(entbase['ZCZB'])

# ETYPE
etype_compressed = pd.get_dummies(entbase['ETYPE'])
etype_compressed_df = pd.DataFrame(
    np.array(etype_compressed),
    columns=list(['etype' + str(x) for x in sorted(entbase['ETYPE'].unique())]))
entbase = entbase.join(etype_compressed_df)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values=np.nan)  # strategy defaults to 'mean'
imputer_out = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# label-encode the string column to integers before one-hot encoding it
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder_X = OneHotEncoder(categorical_features=[0])
X = onehotencoder_X.fit_transform(X).toarray()

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
import pandas

base = pandas.read_csv('E:\\Udemy - Cursos\\MachineLearning\\Arquivos\\CreditData.csv')
base.loc[base.age < 0, 'age'] = 40.92

previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.cross_validation import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

from sklearn.tree import DecisionTreeClassifier  # import the classifier
# create the classifier; fixing random_state means the same portions of the data set are used on every run
classificador = DecisionTreeClassifier(criterion='entropy', random_state=0)
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)
    # (tail of the people class defined before this excerpt)
    boy = 180

    def kosmak(self, b):
        return b + 10


ali = people()
print(ali.boy)
print(ali.kosmak(90))

# sci-kit learn = sklearn
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
Yas = veriler.iloc[:, 1:4].values
print(Yas)
imputer = imputer.fit(Yas[:, 1:4])  # fit on every row of the numeric columns
Yas[:, 1:4] = imputer.transform(Yas[:, 1:4])
print("\n")
print(Yas)

ulke = veriler.iloc[:, 0:1].values
print(ulke)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ulke[:, 0] = le.fit_transform(ulke[:, 0])
print(ulke)

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features="all")
ulke = ohe.fit_transform(ulke).toarray()
print(ulke)
# importing libraries
import pandas as pd

# importing dataset
# create matrix of independent variables (features)
data_X = pd.read_csv('secom.data.txt', sep=' ')
X = data_X.values

# create dependent variable vector
data_y = pd.read_csv('secom_labels.data.txt', sep=' ')
y = data_y.iloc[:, 0].values

# handling missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X)
X = imputer.transform(X)

# L1 based feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# SelectFromModel and Logistic Regression
logreg = LogisticRegression(random_state=0)
logreg.fit(X, y)
model = SelectFromModel(logreg)
X_logreg = model.fit(X, y)
#X_logreg.shape

# Get indices of selected features
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1]
X = X.iloc[:, 1:].values
y = dataset.iloc[:, 3].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 0:2])
X[:, 0:2] = imputer.transform(X[:, 0:2])

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling (after the split, so the scaler is fitted on the training data only)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)

"""PRATEEK"""
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.decomposition import PCA

data = np.loadtxt("secom.data")

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(data)
data = imp.transform(data)

pca = PCA(n_components=6)
pca.fit(data)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
print(np.shape(pca.components_))
# importing packages
import statsmodels as stat
import seaborn as sbrn
import pandas as pds
import matplotlib.pyplot as mplt
import numpy as np

dtst = pds.read_csv("credit_immo.csv")
X = dtst.iloc[:, -9:-1].values
Y = dtst.iloc[:, -1].values

# data cleaning
from sklearn.preprocessing import Imputer
imptr = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit and transform each column with its own statistics
X[:, 0:1] = imptr.fit_transform(X[:, 0:1])
X[:, 7:8] = imptr.fit_transform(X[:, 7:8])

# categorical data
## encoding the independent variables
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labEncre_X = LabelEncoder()
X[:, 2] = labEncre_X.fit_transform(X[:, 2])
X[:, 5] = labEncre_X.fit_transform(X[:, 5])
onehotEncr = OneHotEncoder(categorical_features=[2, 5])
X = onehotEncr.fit_transform(X).toarray()
Churning_train = "train/churn_train.csv"
Churning_test = "test/churn_train.csv"
Churning_pred = "Validation/FinalPred.csv"


def load_churning_data(datapath):
    return pd.read_csv(datapath)


# load the dataset
Churning_train_dataset = load_churning_data(Churning_train)
Churning_train_dataset.head(100)
Churning_train_dataset.info()

# finding the data distribution in various states
Churning_train_dataset['st'].value_counts()
Churning_train_dataset.describe()

# find the variable distribution
Churning_train_dataset.hist(bins=50, figsize=(20, 15))
Churning_train_dataset.nummailmes = Churning_train_dataset.nummailmes.replace(0, np.NaN)
Churning_train_dataset.hist(bins=50, figsize=(20, 15))

# finding correlation matrix
corr_matrix = Churning_train_dataset.corr()

# creating imputer object
Churning_train_dataset.nummailmes.fillna(0)
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(Churning_train_dataset.iloc[:, 5].reshape(-1, 1))
Churning_train_dataset.iloc[:, 5] = imputer.transform(
    Churning_train_dataset.iloc[:, 5].reshape(-1, 1)).reshape(-1)
print("Join with store") train = pd.merge(train, store, on='Store') test = pd.merge(test, store, on='Store') features = [] imp = Imputer(missing_values=0, strategy="median", axis=0) scl = StandardScaler(copy=True, with_mean=True, with_std=True) print("augment features") build_features(features, train) build_features([], test) print(features) imp.fit(train[features]) scl.fit(train[features]) imp.fit(test[features]) scl.fit(test[features]) train[features] = imp.transform(train[features]) test[features] = imp.transform(test[features]) train[features] = scl.transform(train[features]) test[features] = scl.transform(test[features]) params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, "subsample": 0.7,
from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import Imputer from tpot.builtins import StackingEstimator from sklearn.preprocessing import FunctionTransformer from copy import copy # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) imputer = Imputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) # Score on the training set was:0.9931558441558442 exported_pipeline = make_pipeline( make_union( FunctionTransformer(copy), FunctionTransformer(copy) ), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
def classifier(training_data,data_false,directory,priori_prob,shape_y,f_type): #function to compute the classifier for a label and then save it '''this function creates the initial 1st classifier.''' print '\n' #print 'Company is :',directory #for i in range(4): #training_dataf= np.asarray(training_dataf) #print 'shape of false data',data_false.shape #print 'shape of training_dataf',training_dataf.shape training_dataf= np.asarray(data_false) #training_dataf = training_dataf[:count1,:] #false training data so that both havesame size r1,c1 = training_data.shape r2,c2 = training_dataf.shape label_true = [] label_false = [] #--creating labels for true and false data--# for m in range(r1): label_true.append(1) for n in range(r2): label_false.append(0) label_true = np.asarray(label_true) label_false = np.asarray(label_false) #print 'b4imputer' #--removing nans by the medians--# imp = Imputer(strategy = 'median') imp.fit(training_data) training_data = imp.transform(training_data) imp.fit(training_dataf) training_dataf = imp.transform(training_dataf) #print 'after' #--final training data---# final_training = np.concatenate((training_data,training_dataf)) temp3,temp4 = final_training.shape #----------creating labels for final_training------------# final_labels= np.concatenate((label_true,label_false)) #print 'shape of ifnal ddata',final_labels.shape,final_training.shape #--generating testing and training data randomly--# #X_train, X_test, y_train, y_test = train_test_split(final_training, final_labels, train_size=0.80, random_state=42) #X_train,y_train,f2 = split_new(final_training,final_labels,0.8) #split the training and testing data #--creating instance of random forest---# #print 'final training' X_train = final_training y_train = final_labels temp1,temp2 = X_train.shape #print 'teri makk',temp1, temp2 est = RandomForestClassifier(n_estimators =20,max_features='auto',max_depth=None,min_samples_split=2,min_samples_leaf=1,min_weight_fraction_leaf=0,max_leaf_nodes=None,n_jobs=1) #--fitting data and labels--# est.fit(X_train,y_train) #make trees from trainning data and labels #x_train is the training data, and y_train are there labels. #print 'score',est.score(X_test,y_test) Location = classi+f_type #print 'Location',Location try : os.stat(Location) except : os.mkdir(Location) save_location = Location+'/'+directory+'_'+str(0)+'.pkl' #print 'shape',test_data.shape joblib.dump(est, save_location,compress=9)#only save the classifier not the data.. #0 is sent to check the recusrion depth ret = re_train_prefilt(save_location,directory,X_train,y_train,shape_y,0,f_type)
# Data normalization
# drop columns that will not be used
x_train = x_train.drop('Cabin', 1)
x_test = x_test.drop('Cabin', 1)
x_train = x_train.drop('Ticket', 1)
x_test = x_test.drop('Ticket', 1)
x_train = x_train.drop('Name', 1)
x_test = x_test.drop('Name', 1)

print('Missing values')
print(x_train.isnull().sum())

from sklearn.preprocessing.imputation import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_train)
imputed_data = imr.transform(x_train.values)
print('transformed')
print(imputed_data[:200])

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_test)
imputed_data2 = imr.transform(x_test.values)
print('transformed')
print(imputed_data2[:200])

std = StandardScaler()
x_train_std = std.fit_transform(imputed_data)
# reuse the statistics learned on the training data for the test data
x_test_std = std.transform(imputed_data2)
print(x_train_std)
print(df.columns)
print(type(df))

# Replacing y with 1 and n with 0
df = df.replace('y', 1)
df = df.replace('n', 0)
df = df.replace('republican', 1)
df = df.replace('democrat', 0)
#print('After replacement:', df)

df = df.replace('?', np.NaN)  # Replace missing value with NaN
print('NaN replaced data set: ', df)
#print(df.isnull().head())

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(df)
df_clean = imp.transform(df)
print('Clean Data Set:', df_clean)
print(df_clean.shape)
print(type(df_clean))

column_list = [
    'republican', 'handicapped-infants', 'water-project-cost-sharing',
    'adoption-of-the-budget-resolution', 'physician-fee-freeze', 'el-salvador-aid',
    'religious-groups-in-schools', 'anti-satellite-test-ban',
    'aid-to-nicaraguan-contras', 'mx-missile', 'immigration',
    'synfuels-corporation-cutback', 'education-spending', 'superfund-right-to-sue',
    'crime', 'duty-free-exports', 'export-administration-act-south-africa'
]
if __name__ == '__main__':
    impt = Imputer()
    scal = MinMaxScaler()

    train = pd.read_csv("broadband_train.csv", sep=",", encoding='gbk')
    train['GENDER'] = train['GENDER'].replace('男', 0).replace('女', 1)
    train['AUTOPAY'] = train['AUTOPAY'].replace('否', 0).replace('是', 1)
    train = train.apply(lambda s: format_series(s, True))

    test = pd.read_csv("broadband_test.csv", sep=",", encoding='utf-8')
    test['GENDER'] = test['GENDER'].replace('男', 0).replace('女', 1)
    test['AUTOPAY'] = test['AUTOPAY'].replace('否', 0).replace('是', 1)
    test = test.apply(lambda s: format_series(s, False))

    train_X = train.iloc[:, 1:-1]
    train_Y = train.iloc[:, -1]
    test_X = test.iloc[:, 1:]

    impt.fit(train_X)
    train_X = impt.transform(train_X)
    test_X = impt.transform(test_X)

    scal.fit(train_X)
    train_X = scal.transform(train_X)
    test_X = scal.transform(test_X)

    model = svm.SVC()
    model.fit(train_X, train_Y)
    print(cross_val_score(model, train_X, train_Y))

    data = model.predict(test_X)
    print(data)
    #[ 0.83532934  0.84984985  0.81927711]
#dataset.drop('dma', axis=1, inplace=True)
#print("column number")
#print(dataset.columns, len(dataset.columns), len(dataset.index))

dt = dataset.values
d = dt.astype(float)

#print("Checking for NaN and Inf")
#print("np.nan=", np.where(np.isnan(d)))
#print("is.inf=", np.where(np.isinf(d)))
#print("********************************************")

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(d)
d = imp.fit_transform(d)
##print("values after encoding", values)

# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(d)
##print("scaled values", scaled)

# specify the number of lag hours
n_hours = 4
n_features = len(dataset.columns)
n_ahead = 1
st = n_hours * n_features

# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, n_ahead)
#print("column number")
cols = list(dataset.columns.values)
cols.pop(cols.index('revenue_class'))
dataset = dataset[cols + ['revenue_class']]
dataset.drop(dataset.columns[-10], axis=1, inplace=True)
dataset.drop(dataset.columns[-24], axis=1, inplace=True)

# dependent and independent variables
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, :])
X[:, :] = imputer.transform(X[:, :])

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = sel.fit_transform(X)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
class DataSetBuilder: ''' a Class build to combine feature extraction for catagories, text and numeric data Defaults to using mutliprocessing use if __name__ == '__main__': before fit and transform calls on windows ''' def __init__(self, params=None, col_dict=None): ''' :param params: :param col_dict: dictionary with keys 'cat_cols', text_cols', 'imputer_cols'. 'zero_imputer_cols', the values are the column names in a pandas data frame to prepreocess ''' from nltk.corpus import stopwords self.default_params = {'text_cols': {'max_features': 200, 'min_freq': 0.001, 'ngram_range': (1, 1), 'min_len': 3, 'stop_words': set(stopwords.words('english'))}, 'cat_cols': {'min_freq': 0.01}, 'imputer_cols': {'strategy': 'median'}} if params is None: self.params = self.default_params else: self.update_params(params) self.par = True self.cat_encoder = None self.text_encoder = None self.col_dict = col_dict self.imputer = None self.feature_names = [] def update_params(self, params): new_params = self.default_params for p in params.keys(): temp_params = params[p] for pp in temp_params.keys(): new_params[p][pp] = temp_params[pp] self.params = new_params def fit(self, data): ''' :param data: pandas data frame containing all the columns listed in col_dict :return: none, encoders are saved within class ''' col = 'text_cols' if col in self.col_dict.keys(): print('fitting', col, ':', self.col_dict[col]) self.text_encoder = TextFeature() self.text_encoder.par = self.par self.text_encoder.max_features = self.params[col]['max_features'] self.text_encoder.min_freq = self.params[col]['min_freq'] self.text_encoder.ngram_range = self.params[col]['ngram_range'] self.text_encoder.min_len = self.params[col]['min_len'] self.text_encoder.stop_words = self.params[col]['stop_words'] self.text_encoder.fit(data[self.col_dict[col]]) self.feature_names = self.feature_names + self.text_encoder.feature_names col = 'cat_cols' if col in self.col_dict.keys(): print('fitting', col, ':', self.col_dict[col]) self.cat_encoder = CatEncoder() self.cat_encoder.par = self.par self.cat_encoder.min_freq = self.params[col]['min_freq'] self.cat_encoder.fit(data[self.col_dict[col]]) self.feature_names = self.feature_names + self.cat_encoder.feature_names col = 'imputer_cols' if col in self.col_dict.keys(): print('fitting', col, ':', self.col_dict[col]) from sklearn.preprocessing import Imputer self.imputer = Imputer(strategy=self.params[col]['strategy']) self.imputer.fit(data[self.col_dict[col]]) self.feature_names = self.feature_names + self.col_dict[col] col = 'zero_imputer_cols' if col in self.col_dict.keys(): self.feature_names = self.feature_names + self.col_dict[col] def transform(self, data): ''' :param data: a pandas data frame with all the columns listed in col_dict :return: scipy sparse matrix of features ''' from scipy import sparse self.cat_encoder.par = self.text_encoder.par = self.par output_list = [] col = 'text_cols' if col in self.col_dict.keys(): output_list.append(self.text_encoder.transform(data[self.col_dict[col]])) print('transforming', col, ':', self.col_dict[col]) col = 'cat_cols' if col in self.col_dict.keys(): print('transforming', col, ':', self.col_dict[col]) output_list.append(self.cat_encoder.transform(data[self.col_dict[col]])) col = 'imputer_cols' if col in self.col_dict.keys(): print('transforming', col, ':', self.col_dict[col]) output_list.append(sparse.csr_matrix(self.imputer.transform(data[self.col_dict[col]]))) col = 'zero_imputer_cols' if col in self.col_dict.keys(): import pandas as pd print('transforming', 
col, ':', self.col_dict[col]) output_list.append(sparse.csr_matrix(data[self.col_dict[col]].fillna(0))) output = sparse.hstack(output_list) return output
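# Hypothetical usage sketch (assumed column names and data, not from the original
# source) for the DataSetBuilder class above: col_dict maps each preprocessing
# step to the data frame columns it should handle, and transform() stacks the
# text, categorical, and imputed numeric features into one sparse matrix.
import pandas as pd

df = pd.DataFrame({'description': ['red shirt', 'blue shoes'],
                   'category': ['apparel', 'footwear'],
                   'price': [10.0, None]})
builder = DataSetBuilder(col_dict={'text_cols': ['description'],
                                   'cat_cols': ['category'],
                                   'imputer_cols': ['price']})
builder.fit(df)
features = builder.transform(df)  # scipy.sparse matrix; names in builder.feature_names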
def train(train_X, train_Y, feature_names): imp = Imputer(missing_values='NaN', strategy='median', axis=0) enc = OneHotEncoder(categorical_features=np.array([65, 66]), sparse=False, n_values=80) imp.fit(train_X) train_X = imp.transform(train_X) """ enc.fit(train_X); train_X = enc.transform(train_X); """ print("No of features : " + str(len(train_X[0]))) train_Y = np.array(train_Y) dtrain = xgb.DMatrix(train_X, label=train_Y) parameters_to_try = generateParams() best_params = None overall_best_auc = 0 overall_best_nrounds = 0 for i in range(0, len(parameters_to_try)): param = parameters_to_try[i] num_round = 2000 bst_cv = xgb.cv(param, dtrain, num_round, nfold=20, metrics={'auc'}, show_stdv=False, seed=0) best_iteration = 0 best_auc = 0 for i in range(0, len(bst_cv)): eval_result = bst_cv[i].split("\t") val_auc = float(eval_result[1].split(":")[1]) if val_auc > best_auc: best_auc = val_auc best_iteration = int(eval_result[0].replace("[", "").replace( "]", "")) print("\n Best AUC : " + str(best_auc) + " for Params " + str(param) + " occurs at " + str(best_iteration)) if best_auc > overall_best_auc: overall_best_auc = best_auc best_params = copy.copy(param) overall_best_nrounds = best_iteration print( "\n Training the model on the entire training set with the best params" ) bst = xgb.train(best_params, dtrain, overall_best_nrounds) print("\n\n Overall Best AUC : " + str(overall_best_auc) + " for Params " + str(best_params) + " occurs at " + str(best_iteration)) feature_imp = bst.get_fscore() print("Feature Importance ... ") for w in sorted(feature_imp, key=feature_imp.get, reverse=True): print( str(feature_names[int(w.replace("f", ""))]) + " : " + str(feature_imp[w])) return bst, imp, enc
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Wine_Quality_Data.csv')
dataset.describe()
dataset.hist()
#import seaborn as sns
X = dataset.iloc[:, :-1].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, [0]])
X[:, [0]] = imputer.transform(X[:, [0]])

pd.set_option('precision', 3)
cor = dataset.corr(method='pearson')

# Selected these features using the feature_importances method after applying Random Classification Method
X = dataset.iloc[:, [5, 6]].values

# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')