def remove_and_correct_outliers(data):  ## is data in a normal distribution??
    b_constant = 1.4826  ## constant used for a normal distribution
    factor = 10  # 3  ## factor to multiply the range by
    count = 0
    for i in range(0, len(data[0].values)):  ## iterate through all features (6125 in this case)
        d_s, d_ns, _, _ = utils.get_utterance_values_of_ith_utterance(data, i)  ## get all feature values
        d = d_s + d_ns  ## join them together, since the function returns separate arrays for stress and non-stress
        f_vals = np.array(d, dtype=float)  ## transform list into np array
        median = np.median(f_vals)  ## get the median
        diff = (f_vals - median)**2  ## subtract the median from every element and **2 to make all values positive
        diff = np.sqrt(diff)  ## undo the **2 trick used to avoid negatives
        med_abs_deviation = np.median(diff)  ## median absolute deviation (MAD)
        threshold = med_abs_deviation * b_constant  ## range of values to be accepted
        max_range = median + threshold * factor
        min_range = median - threshold * factor
        for j in range(0, len(f_vals)):  ## mark values outside the boundaries as outliers
            if f_vals[j] < min_range or f_vals[j] > max_range:
                count += 1
                f_vals[j] = np.nan
        imp = Imputer(missing_values=np.nan, strategy='mean', axis=1)
        f_vals = imp.fit_transform(f_vals)[0]
        for j in range(0, len(f_vals)):
            data[j].values[i] = round(f_vals[j], 6)
    print "Detected ", count, " outliers"
    return data
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "scikit centers"
        print km_sci.cluster_centers_
def get_features(frame): ''' Transforms and scales the input data and returns a numpy array that is suitable for use with scikit-learn. Note that in unsupervised learning there are no labels. ''' # Replace missing values with 0.0 # or we can use scikit-learn to calculate missing values below #frame[frame.isnull()] = 0.0 # Convert values to floats arr = np.array(frame, dtype=np.float) # Impute missing values from the mean of their entire column from sklearn.preprocessing import Imputer imputer = Imputer(strategy='mean') arr = imputer.fit_transform(arr) # Normalize the entire data set to mean=0.0 and variance=1.0 from sklearn.preprocessing import scale arr = scale(arr) return arr
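# Illustrative call to get_features above (not part of the original source). It assumes pandas and numpy
# are imported and an sklearn release that still ships preprocessing.Imputer (< 0.22); the frame contents
# are made up for the example.
frame = pd.DataFrame({'x1': [1.0, 2.0, None], 'x2': [4.0, None, 6.0]})
features = get_features(frame)  # NaNs are imputed with column means, then each column is standardized
print(features.shape)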
def data_organizer( instances, outcomes ): """ Operations to organize data as desired """ # Remove instances without GPA data new_instances = [] new_outcomes = [] for instance,outcome in zip(instances,outcomes): u1,u2,gpa = outcome if not math.isnan( gpa ): new_instances.append( [value for value in instance] ) new_outcomes.append( [value for value in outcome] ) instances = new_instances outcomes = new_outcomes # Fill in NaN values with median instance_list = [] for idx,instance in enumerate(instances): instance_list.append( [ value for value in instance ] ) bandaid = Imputer( strategy='median' ) instances = bandaid.fit_transform( instance_list ) return instances, outcomes
def run_whole_video(exp_folder, lims_ID): #initializes video pointer for video of interest based on lims ID file_string = get_file_string(exp_folder, lims_ID) video_pointer = cv2.VideoCapture(file_string) # import wheel data wheel = joblib.load('dxds2.pkl') first_non_nan = next(x for x in wheel if not isnan(x)) first_index = np.where(wheel == first_non_nan)[0] k = first_index[0] imp = Imputer(missing_values='NaN', strategy='mean') wheel = imp.fit_transform(wheel) wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel) # self.video_pointer.set(1, 41000) ret, frame = video_pointer.read() # crops and converts frame into desired format frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY) prvs = frame nex = frame # initialize vectors to keep track of data count = 0 mod = 0 opticals = [] angles = [] frames = [] # length of movie limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)) # create hdf file hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w') g = hf.create_group('feature space') vector = np.zeros((limit, 4321)) table = g.create_dataset('features', data = vector, shape =(limit, 4321)) while count <= limit: prvs = nex frames = process_input(prvs) ret, frame = video_pointer.read() nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY) optical = optical_flow(prvs, nex) opticals = optical['mag'] angles= optical['ang'] vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles)) table[count, :] = vector_data count += 1 if count%1000 == 0: print (count)
def process(discrete, cont): # Create discrete and continuous data matrices discrete_X = np.array(discrete) cont_X = np.array(cont) # Impute discrete values imp = Imputer(strategy='most_frequent') discrete_X = imp.fit_transform(discrete_X) # Impute continuous values imp_c = Imputer(strategy='mean') cont_X = imp_c.fit_transform(cont_X) # Discrete basis representation enc = OneHotEncoder() enc.fit(discrete_X) discrete_X = enc.transform(discrete_X).toarray() # Continuous scaling scaler = StandardScaler() scaler.fit(cont_X) cont_X = scaler.transform(cont_X) # Merge to one array X = np.concatenate((discrete_X, cont_X), axis=1) return X
def impute_and_scale(df, scaling='std'): """Impute missing values with mean and scale data included in pandas dataframe. Parameters ---------- df : pandas dataframe dataframe to impute and scale scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std') type of scaling to apply """ df = df.dropna(axis=1, how='all') imputer = Imputer(strategy='mean', axis=0) mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': return pd.DataFrame(mat, columns=df.columns) if scaling == 'maxabs': scaler = MaxAbsScaler() elif scaling == 'minmax': scaler = MinMaxScaler() else: scaler = StandardScaler() mat = scaler.fit_transform(mat) df = pd.DataFrame(mat, columns=df.columns) return df
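# Minimal usage sketch for impute_and_scale above (not from the original source). It assumes pandas and
# numpy are imported and an sklearn version that still provides preprocessing.Imputer (< 0.22); the toy
# DataFrame and its column names are made up for illustration.
toy = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                    'b': [np.nan, np.nan, np.nan],  # all-NaN column, dropped by dropna(axis=1, how='all')
                    'c': [10.0, 20.0, 30.0]})
scaled = impute_and_scale(toy, scaling='minmax')  # NaN in 'a' is filled with the column mean, then scaled to [0, 1]
print(scaled)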
def benignKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    from h2o.estimators.kmeans import H2OKMeansEstimator

    for i in range(1, 7):
        benign_h2o_km = H2OKMeansEstimator(k=i)
        benign_h2o_km.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        print "H2O centers"
        print benign_h2o_km.centers()

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print "scikit centers"
        print benign_sci_km.cluster_centers_
def learn(): global classifier, INPUT print 1 data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8') np.random.shuffle(data) n = len(data) y = data[:,1] x = data[:][:,range(2,54)] # test_x = [] # test_y = [] train_x = [] train_y = [] print 2 imp = Imputer(missing_values='NaN', strategy='mean', axis=0) x = imp.fit_transform(x) print 3 for i in range(0, n): if y[i] == 0: continue train_x.append(x[i]) train_y.append(y[i]) # if i%100==0: # test_x.append(x[i]) # test_y.append(y[i]) # else: # train_x.append(x[i]) # train_y.append(y[i]) print 4 classifier.fit(train_x, train_y) print 5
def fit(self, train_x, train_y=None, is_norm=True):
    # Normalization (min-max scaling with the training-set statistics)
    if is_norm:
        train_x_min = train_x.min(0)
        train_x_ptp = train_x.ptp(axis=0)
        train_x = (train_x.astype(float) - train_x_min) / train_x_ptp
        if np.any(train_y):
            train_y = (train_y.astype(float) - train_x_min) / train_x_ptp

    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    imp.fit(train_x)

    if np.isnan(train_x).any():
        log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
        train_x = imp.transform(train_x)

    if np.any(train_y) and np.isnan(train_y).any():
        log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
        train_y = imp.transform(train_y)

    if np.any(train_y):
        self.model.fit(train_x, train_y)
    else:
        self.model.fit(train_x)
def preprocess(data): non_sparse_only = True use_all_category_only = False use_all_impute_mean_mode = False if non_sparse_only: nominal_samples = data.ix[:,['var4','dummy']] onehot_samples = onehot.transform(nominal_samples,['var4','dummy']) onehot_samples = pd.DataFrame(onehot_samples.toarray()) numbered_samples = data.ix[:,['var7','var8','var10','var11','var13','var15','var17']] numbered_samples[['var7','var8']] = numbered_samples[['var7','var8']].convert_objects(convert_numeric=True) #(var7 and 8 are ordinal, converting to floats which includes NaNs will allow mean imputing of missing values) other_samples = data.ix[:,'crimeVar1':'weatherVar236'] #all the continuous vars other_samples = other_samples.drop(['weatherVar115'], axis=1) #nothing in this feature samples = pd.concat([onehot_samples,numbered_samples,other_samples],axis=1) #combine w/ the cleaned up other vars imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0) samples_imp = imp_nan.fit_transform(samples) if use_all_category_only: todo if use_all_impute_mean_mode: todo return samples_imp
def run_main(new_file, start, stop, dat): with open(new_file, 'a') as file: imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1) import itertools with open(dat, "r") as text_file: for line in itertools.islice(text_file, start, stop): line = line.replace("NA", "NaN") content = line.rstrip('\n').split('\t') CpG = content.pop(0) flag, CpG_location = get_location(CpG) if flag == 'F': continue genotype_matrix = get_genotypes(CpG_location) genotype_matrix = imp.transform(genotype_matrix) genotype_matrix = genotype_matrix.transpose() #run PCA try: PCA_matrix = run_pca(genotype_matrix) except ValueError: print "value error" continue #run linear regression meth_values = pd.Series(content, name="meth_val", dtype=float) model = sm.OLS(meth_values, PCA_matrix) results = model.fit() MethValResids = results.resid final = pd.Series(CpG) final = final.append(MethValResids) fline = final.tolist() fline = '\t'.join(str(x) for x in fline) fline = fline + "\n" file.write(fline)
def test_3_stage(self): from sklearn.preprocessing import Imputer infile_name = path_of_data('missing_vals.csv') p = Pipeline() csv_read_node = p.add(CSVRead(infile_name)) csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv'))) impute_node = p.add(wrap_and_make_instance(Imputer)) csv_read_node['output'] > impute_node['X_train'] impute_node['X_new'] > csv_write_node['input'] self.run_pipeline(p) ctrl_imputer = Imputer() ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",", names=True) num_type = ctrl_X_sa[0][0].dtype ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa) ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd) control = ctrl_X_new_nd result = self._tmp_files.csv_read('out.csv', True) self.assertTrue(np.allclose(result, control))
def imputed_data(df, colname, strategy="mean"):
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values="NaN", strategy=strategy, axis=0)
    imr = imr.fit(df[colname].values.reshape(-1, 1))
    imputed_data = imr.transform(df[colname].values.reshape(-1, 1))
    df[colname] = imputed_data
    print("Data has been imputed to \"{}\"".format(colname))
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Imputes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Impute all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit an imputer for each column in the data frame
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the imputer to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])
        return output
def impute_missing_data(datapoints, strategy='mean'):
    """
    Imputes values for the 8 features missing data

    Arguments:
    datapoints -- X, a dataset with missing values represented as 999.0 and 9999.0
    strategy [optional] -- an imputation strategy, e.g., mean, median, or most_frequent

    Returns:
    X_imputed -- a dataset with missing values imputed according to the provided
    or default (mean) strategy. Uses the scikit-learn Imputer class.
    """
    # First we will replace our placeholder values with NaN to only have
    # to run one imputation.
    np.putmask(datapoints, datapoints == 999.0, np.NaN)
    np.putmask(datapoints, datapoints == 9999.0, np.NaN)

    # Now create an imputer over NaN values, and average over axis=0 (columns)
    # Then, fit the imputer to the dataset.
    imp = Imputer(strategy=strategy, axis=0)
    X_imputed = imp.fit_transform(datapoints)

    return X_imputed
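# Illustrative sketch for impute_missing_data above (not from the original source). It assumes numpy is
# imported and the old sklearn Imputer is available; the small float array with 999.0/9999.0 placeholders
# is made up.
X_demo = np.array([[1.0, 999.0, 3.0],
                   [4.0, 5.0, 9999.0],
                   [7.0, 8.0, 9.0]])
X_clean = impute_missing_data(X_demo, strategy='median')  # placeholders become NaN, then column medians fill them
print(X_clean)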
def load_datasets(feature_paths, label_paths):
    ''' Read the feature files and the label files and return them. '''
    # Define the feature array with 41 columns (matching the feature dimension) and an
    # empty label array with 1 column (matching the label dimension)
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # Use pandas read_table to load one feature file, with comma as the delimiter,
        # '?' as the missing-value marker and no header row
        #df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # pandas.read_csv(source, encoding='utf-8', parse_dates=[0] parses column 0 as dates,
        # index_col=0 uses column 0 as the row index)
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        # DataFrame.sort_index(axis=0 (sort along rows), ascending=True, inplace=False (whether to overwrite the original data))
        # sort the data in ascending order by time
        #df.sort_index(0, ascending=True, inplace=True)
        # Use Imputer with strategy='mean' to fill missing data with the column mean.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # fit() trains the preprocessor; transform() produces the preprocessed result.
        imp.fit(df)
        df = imp.transform(df)
        # Append the preprocessed data to feature; repeat for all feature files
        feature = np.concatenate((feature, df))
    # Read the label files
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten the labels into a 1-D vector
    label = np.ravel(label)
    return feature, label
def impute_missing_train(dataframe, missing_values='NaN', strategy='mean'): ''' Given a dataframe, imputes missing values with a given strategy. Supported strategies: 'mean', 'median', 'most_frequent'. Returns dictionary mapping transformed columns to its imputer value. ''' from sklearn.preprocessing import Imputer imp = Imputer(missing_values=missing_values, strategy=strategy, axis=0) imputed = imp.fit_transform(dataframe) df = pd.DataFrame(imputed) df.columns = list(dataframe.columns) imputers = {} if strategy == 'mean': for col in df.columns: mean = df[col].mean() imputers[col] = mean if strategy == 'median': for col in df.columns: median = df[col].median() imputers[col] = median if strategy == 'most_frequent': for col in df.columns: mode = df[col].mode() imputers[col] = mode return df, imputers
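# A hedged sketch (not from the original source) of how the `imputers` mapping returned above could be
# applied to a held-out dataframe, so the test set is filled with the statistics learned from the training
# data. It assumes pandas is imported and that `train_df`/`test_df` are dataframes with the same columns.
train_clean, imputers = impute_missing_train(train_df, strategy='median')
test_clean = test_df.copy()
for col, value in imputers.items():
    # mode() returns a Series, so take its first entry when the 'most_frequent' strategy was used
    fill = value.iloc[0] if hasattr(value, 'iloc') else value
    test_clean[col] = test_clean[col].fillna(fill)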
def Train_And_Test(self): HOG_data=np.loadtxt('dataset.csv',delimiter=",") tmpdata=HOG_data[:,0:-2] target=HOG_data[:,-2] print(target) tmpdata[tmpdata==0]=np.nan imp=Imputer(missing_values='NaN',strategy='mean') data=imp.fit_transform(tmpdata) data_train,data_test,target_train,target_test=train_test_split(data,target,test_size=0.3) model=SVC(C=1.0,gamma=0.0,kernel='linear', class_weight='auto') model.fit(data_train,target_train) print(data_train) print(target_train) opencv_data_train=np.float32(data_train) opencv_target_train=np.float32(target_train) svm_params = dict( kernel_type = cv2.SVM_LINEAR, svm_type = cv2.SVM_C_SVC, C=2.67, gamma=5.383) svm = cv2.SVM() svm.train(opencv_data_train,opencv_target_train, params=svm_params) svm.save("hog_classifier.xml") print(model) expected=target_test predicted=model.predict(data_test) target_names = ['Not Human', 'Human'] print(metrics.classification_report(expected,predicted,target_names=target_names)) print(metrics.confusion_matrix(expected,predicted)) print(metrics.roc_curve(expected,predicted)) pickle.dump(model, open( "svm.p", "wb" ) )
def avg_message_count_by_group(df_users, df_messages, df_user_features): columns = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"] features = df_user_features[list(columns)].values # Impute missing values to retain all sample data imp = Imputer(missing_values='NaN', strategy='mean', axis=0) X = imp.fit_transform(features) # Preprocess dataset and standardize features to have normally distributed data # MaxAbsScaler allows scaled features to lie between -1 and +1 X = MaxAbsScaler().fit_transform(X) # Apply PCA decomposition and use first 3 components that explain 75% of variance reduced_data = decomposition.PCA(n_components=3).fit_transform(X) kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10) # Predict which group each user belongs to cluster_labels = kmeans.fit_predict(reduced_data) df_user_features['group.id'] = cluster_labels # Call utility function to join the two dataframes df_joined_users_messages = get_merged_dataframes(df_users, df_messages) df_joined_users_messages_features = get_merged_dataframes(df_user_features, df_joined_users_messages) # Only keep messages that were received since signing up df_joined_users_messages_features = df_joined_users_messages_features[df_joined_users_messages_features['message.date'] >= df_joined_users_messages_features['signup.date']] # Get the average message count grouped by group.id avg_message_count = df_joined_users_messages_features.groupby('group.id')['message.count'].mean() # Return the average message count grouped by user groups and rounded to 2 decimals return np.round(avg_message_count.tolist(), decimals=2)
def fill_and_remove(self, s_strategy="zeros", l_features = False, b_remove = True): ''' fill all Nan values in numerical data with zeros and then remove data points that all features are equal to zero l_features: a list of features to be tested. If any, all features will be used b_remove: boolean indicating if should remove keys where all data is 0 s_strategy: string with the strategy used to fill NaNs. Can be "mean", "median" and "zeros" ''' df = self.getData() #pre-process data if not l_features: l_features = self.payments_features + self.stock_features l_features+= self.email_features df.loc[:, l_features] = df.loc[:, l_features].astype(float) #filling Nan with the strategy selected if s_strategy == "zeros": df.loc[:, l_features] = df.loc[:, l_features].fillna(0) else: na_X = df.loc[:, l_features].values imp = Imputer(missing_values='NaN', strategy=s_strategy, axis=0) df.loc[:, l_features] = imp.fit_transform(na_X) #exclude datapoint where every number is equal to 0 if b_remove: df = df.ix[((df.loc[:, l_features]!=0).sum(axis=1)!=0),:] #saving the new dataframe self.setData(df) #correct scaled df if type(self.df_scaled)!=list: df2 = self.df_scaled df2 = df2.ix[((df.loc[:, l_features]!=0).sum(axis=1)!=0).index,:] self.df_scaled = df2
def run_importance(clf, data, labels, feature_labels=[""], string=""): """ Fit a classifier using all the data and plot the feature importances :param clf: Classifier object that has feature_importances_ member :param feature_labels: names of the features :param string: classifier name :return: (void) plot Gini importance vs feature """ num_features = data.shape[1] importances = [0]*num_features imp = Imputer(missing_values=np.NaN, strategy="mean") data = imp.fit_transform(data) # run the classifier 100 times and average the importance found after each fit for r in range(100): clf.fit(data, labels) importances = [importances[i]+clf.feature_importances_[i] for i in range(num_features)] importances = [importance/100 for importance in importances] # Filter out the features that have 0 importance (e.g. values are all 0) # non_zeros are the indices in feature_importances that are not 0 non_zeros = [i for i in range(num_features) if not importances[i] == 0] importances = [importances[i] for i in non_zeros] feature_labels = [feature_labels[i] for i in non_zeros] # Plot the features bar_width = 0.7 plt.bar(range(len(feature_labels)), importances, bar_width) plt.xticks([ind + +float(bar_width)/2 for ind in range(len(feature_labels))], feature_labels,rotation="vertical") plt.gcf().subplots_adjust(bottom=0.35) plt.xlabel("Feature") plt.ylabel("Gini Importance") plt.title("Gini Importance v. Features for "+string+" Classifier") plt.show()
def test(): vec = DictVectorizer() imp = Imputer(missing_values='NaN', strategy='mean', axis=0) for filename in glob.glob(r'../dataset/UCI/*.arff'): basename = re.sub(r'(\..*?)$','',os.path.basename(filename)) print basename if basename != DS: continue # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb')) data = arff.loadarff(filename)[0] X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray() imp.fit(X) X = imp.transform(X) labels = np.array([row[-1] for row in data]) y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels]) random = np.random.permutation(range(len(X))) print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())])) for iteration in xrange(10): X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10) for train, test in kf: length, train_size = len(train), 0.1 X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0) for R in xrange(2,10): ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]]) # print "%s R=%d"%(basename,R), cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix) exit()
def computePearson(args): filter(args) with open(args.feature_file, 'r') as fp: features = [line for line in fp.read().splitlines() if not line.startswith('#')] X = loadtxt(TMP_DATA_FILE) y = loadtxt(TMP_LABEL_FILE) assert X.shape[0] == y.shape[0] assert X.shape[1] == len(features) imputer = Imputer(strategy='median', copy=False) X = imputer.fit_transform(X) if args.output_file: with open(args.output_file, 'w') as fp: print >> fp, '\t'.join(['feature', 'coeff', 'pvalue']) for i in range(len(features)): coeff, pvalue = pearsonr(X[:, i], y) print >> fp, '%s\t%f\t%f' % (features[i], coeff, pvalue) if args.group_output_file: groups = getGroups(features) index = {features[i]: i for i in range(len(features))} with open(args.group_output_file, 'w') as fp: print >> fp, '\t'.join(['prefix', 'feature1', 'feature2', 'coeff', 'pvalue']) for prefix, group in groups.iteritems(): for i in range(len(group)): for j in range(i+1, len(group)): coeff, pvalue = pearsonr(X[:, index[group[i]]], X[:, index[group[j]]]) print >> fp, '%s\t%s\t%s\t%f\t%f' % ( prefix, group[i], group[j], coeff, pvalue)
def gettestdata(fil) : data = np.genfromtxt(fil,delimiter=',') imp = Imputer(missing_values='NaN', strategy='median', axis=0) X = imp.fit_transform(data[:,2:]) X = scale(X).copy() #spr.eliminate_zeros() return np.array(X)
def calcEdges(data): n = len(data) usersDic = {} usersId = 0 moviesDic = {} moviesId = 0 for i in range(n): r = data[i] if r[0] not in moviesDic: moviesDic[r[0]] = moviesId moviesId += 1 if r[1] not in usersDic: usersDic[r[1]] = usersId usersId += 1 E = np.zeros((moviesId, usersId)) #E = np.full((moviesId, usersId), np.nan) for i in range(n): user = usersDic[data[i][1]] movie = moviesDic[data[i][0]] E[movie, user] = data[i][2] estimator = Imputer(0, strategy='mean') #estimator = SoftImpute() #estimator.fit(E) #E = estimator.predict(E) E = estimator.fit_transform(E) return E, usersDic, moviesDic
def get_some_data():
    data = melbourne_data
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
def plot_ROCList(clfList, data, labels, stringList=""): """ Plot an ROC curve for each classifier in clfList, training on a single 80/20 split :param clfList: :param data: :param labels: :param stringList: :return: """ if stringList == "": stringList = ["" for i in range(len(labels))] imp = Imputer(missing_values=np.NaN, strategy="mean") data = imp.fit_transform(data) # Cross-validate on the data once using each model to get a ROC curve AUCs, fprs, tprs, threshs = cvList(data, labels, clfList) # Plote a ROC for each clf in clfList for i in range(len(clfList)): fpr = fprs[i] tpr = tprs[i] plt.plot(fpr, tpr) plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate") plt.title(stringList[i]+" ROC Curve, AUC = "+str(AUCs[i])) plt.savefig(stringList[i]+"_ROC.png") plt.close() print stringList[i] + ":" + str(AUCs[i])
def bnp_svm(train, test): print('bnpsvm') ## If a value is missing, set it to the average imp = Imputer(missing_values='NaN', strategy='mean', axis=0) #print("cleaning data") train = train.sample(1000) ## set up training data train1 = train.select_dtypes(include=['float64']) imp.fit(train1) train1 = imp.transform(train1) train1 = np.array(train1).astype(float) ## set up real y target = np.array(train['target']).astype(int) ## set up testing data test1 = test.select_dtypes(include=['float64']) test1 = imp.transform(test1) test1 = np.array(test1).astype(float) #print("training...") clf = svm.SVC(gamma=0.001, C=100, probability=True) #print("testing") clf.fit(train1, target) #print("predicting") yhat = clf.predict_proba(test1) return yhat #print(bnp_svm(train, test))
def run_clfList(clfList, stringList="", normalize=False): """ Run 100-fold 80/20 cross-validation on each classifier in clfList print the average AUC for each classifier :param clfList: list of classifiers to run :param stringList: names of the classifiers :param normalize: whether or not to normalize the data :return: the average AUC for each classifier in clfList """ # data, labels = six_features(force=False) # data, labels = six_and_time_features(force=False) # data, labels = five_features(force=False) # data, labels = five_and_rts(force=False) data, labels = new_features() if normalize: data = normalize_data(data) imp = Imputer(missing_values=np.NaN, strategy="mean") data = imp.fit_transform(data) # Cross-validate all clfs 100 times means = kfoldcvList(data, labels, clfList, 100) if stringList == "": stringList = ["" for i in range(len(labels))] # Print out the mean AUCs for i, mean in enumerate(means): print stringList[i]+": "+str(mean) for mean in means: sys.stdout.write(str(mean) + " & ") sys.stdout.write("\n") return means
# Importing the libraries
import numpy as np  # contains mathematical tools
import matplotlib.pyplot as plt  # plot charts
import pandas as pd  # to import and manage datasets

# Importing dataset
dataset = pd.read_csv('Data.csv')  # reading dataset
# iloc -> integer-location based indexing for selection by position.
X = dataset.iloc[:, :-1].values  # taking all columns except the last one, which is the output label
Y = dataset.iloc[:, 3].values  # taking the column of the output label

# Taking care of missing data
from sklearn.preprocessing import Imputer  # for completing missing values. Select and press Ctrl+I to see syntax
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)  # replace missing values by the mean
imputer = imputer.fit(X[:, 1:3])  # since columns at index 1 and 2 contain missing values
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # LabelEncoder to encode values and OneHotEncoder to give dummy values
labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])  # to categorize the Country column; gives encoded values
onehotencoder = OneHotEncoder(categorical_features=[0])  # to give which column to categorize
X = onehotencoder.fit_transform(X).toarray()
# To categorize the output label we won't need OneHotEncoder, since it is the dependent variable with only 2 labels, Yes or No
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)
# Data Preprocessing Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Filling the missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)  # 0 for along columns and 1 for along rows
imputer.fit(X[:, 1:3])  # column 3 is excluded (columns 1 and 2 have missing data)
X[:, 1:3] = imputer.transform(X[:, 1:3])  # missing data will be filled

# Encoding categorical data (countries here)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# As we can't rank countries as ordered categories, we create dummy variables (matrix form)
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)  # 0 for no and 1 for yes
def processData(): train = pd.read_csv("train.csv") catFeatures = [] numFeatures = [] for name, val in zip(train.columns, train.dtypes): if val in [np.dtype('O'), np.dtype('int64')]: if name not in [ 'GTIME', 'GSTATUS_THREE_MONTHS', 'GSTATUS_SIX_MONTHS', 'GSTATUS_ONE_YEAR', 'GSTATUS_THREE_YEARS' ]: catFeatures.append(name) else: numFeatures.append(name) # catFeatures = ['GENDER', 'ABO', 'LIFE_SUP_TCR', 'MALIG_TCR', 'EXC_HCC', 'EXC_CASE', 'PERM_STATE', 'PREV_AB_SURG_TCR', 'BACT_PERIT_TCR', 'PORTAL_VEIN_TCR', 'TIPSS_TCR', 'WORK_INCOME_TCR', 'INIT_DIALYSIS_PRIOR_WEEK', 'INIT_MELD_OR_PELD', 'FINAL_DIALYSIS_PRIOR_WEEK', 'FINAL_MELD_OR_PELD', 'PERM_STATE_TRR', 'WORK_INCOME_TRR', 'MALIG_TRR', 'LIFE_SUP_TRR', 'PORTAL_VEIN_TRR', 'PREV_AB_SURG_TRR', 'TIPSS_TRR', 'HBV_CORE', 'HBV_SUR_ANTIGEN', 'HCV_SEROSTATUS', 'EBV_SEROSTATUS', 'HIV_SEROSTATUS', 'CMV_STATUS', 'CMV_IGG', 'CMV_IGM', 'TXLIV', 'PREV_TX', 'DDAVP_DON', 'CMV_DON', 'HEP_C_ANTI_DON', 'HBV_CORE_DON', 'HBV_SUR_ANTIGEN_DON', 'DON_TY', 'GENDER_DON', 'HOME_STATE_DON', 'NON_HRT_DON', 'ANTIHYPE_DON', 'PT_DIURETICS_DON', 'PT_STEROIDS_DON', 'PT_T3_DON', 'PT_T4_DON', 'VASODIL_DON', 'VDRL_DON', 'CLIN_INFECT_DON', 'EXTRACRANIAL_CANCER_DON', 'HIST_CIG_DON', 'HIST_COCAINE_DON', 'DIABETES_DON', 'HIST_HYPERTENS_DON', 'HIST_OTH_DRUG_DON', 'ABO_DON', 'INTRACRANIAL_CANCER_DON', 'SKIN_CANCER_DON', 'HIST_CANCER_DON', 'PT_OTH_DON', 'HEPARIN_DON', 'ARGININE_DON', 'INSULIN_DON', 'DIAL_TX', 'ABO_MAT', 'AGE_GROUP', 'MALIG', 'RECOV_OUT_US', 'TATTOOS', 'LI_BIOPSY', 'PROTEIN_URINE', 'CARDARREST_NEURO', 'INOTROP_SUPPORT_DON', 'CDC_RISK_HIV_DON', 'HISTORY_MI_DON', 'CORONARY_ANGIO_DON', 'LT_ONE_WEEK_DON'] # numFeatures = ['WGT_KG_DON_CALC', 'INIT_INR', 'ETHCAT_DON', 'ETHNICITY', 'DGN_TCR', 'REM_CD', 'INIT_AGE', 'ALBUMIN_TX', 'BMI_DON_CALC', 'EXC_EVER', 'OTH_LIFE_SUP_TCR', 'FINAL_ASCITES', 'WGT_KG_CALC', 'END_BMI_CALC', 'LISTYR', 'DDR1', 'FINAL_ALBUMIN', 'DB2', 'INIT_BMI_CALC', 'CITIZENSHIP', 'DB1', 'EDUCATION', 'DAYSWAIT_CHRON', 'OTH_LIFE_SUP_TRR', 'MED_COND_TRR', 'INIT_WGT_KG', 'MELD_PELD_LAB_SCORE', 'NUM_PREV_TX', 'INIT_SERUM_SODIUM', 'VENTILATOR_TCR', 'TX_PROCEDUR_TY', 'LITYP', 'INIT_SERUM_CREAT', 'WGT_KG_TCR', 'TBILI_DON', 'HGT_CM_CALC', 'SGOT_DON', 'ASCITES_TX', 'INIT_MELD_PELD_LAB_SCORE', 'ECD_DONOR', 'CREAT_TX', 'INIT_ENCEPH', 'INIT_HGT_CM', 'PRI_PAYMENT_TRR', 'INIT_STAT', 'ARTIFICIAL_LI_TCR', 'PT_CODE', 'WL_ID_CODE', 'INIT_ALBUMIN', 'ARTIFICIAL_LI_TRR', 'AGE_DON', 'ON_VENT_TRR', 'PRI_PAYMENT_TCR', 'BLOOD_INF_DON', 'CREAT_DON', 'REGION', 'INIT_ASCITES', 'HEMATOCRIT_DON', 'DIAB', 'TBILI_TX', 'FINAL_INR', 'AGE', 'FUNC_STAT_TRR', 'ETHCAT', 'CITIZENSHIP_DON', 'DEATH_MECH_DON', 'FUNC_STAT_TCR', 'FINAL_SERUM_SODIUM', 'COD_CAD_DON', 'FINAL_BILIRUBIN', 'BUN_DON', 'END_STAT', 'BMI_CALC', 'DDR2', 'FINAL_SERUM_CREAT', 'HIST_DIABETES_DON', 'ENCEPH_TX', 'SHARE_TY', 'DA1', 'PH_DON', 'FINAL_MELD_PELD_LAB_SCORE', 'BMI_TCR', 'INIT_BILIRUBIN', 'DISTANCE', 'SGPT_DON', 'PULM_INF_DON', 'HGT_CM_TCR', 'TRANSFUS_TERM_DON', 'FINAL_ENCEPH', 'DIAG', 'DA2', 'HGT_CM_DON_CALC', 'URINE_INF_DON', 'COLD_ISCH', 'INR_TX', 'DEATH_CIRCUM_DON', 'CANCER_SITE_DON'] #Categorical pipeline cat_pipeline = Pipeline([ ('selector', DataFrameSelector(catFeatures)), ('imputer', CategoricalImputer()), ('cat_encoder', CategoricalEncoder("onehot-dense", handle_unknown='ignore')), ]) #Numerical pipeline num_pipeline = Pipeline([ ('selector', DataFrameSelector(numFeatures)), ('imputer', Imputer(strategy="median")), ('std_scaler', StandardScaler()), ]) #Full pipeline full_pipeline = FeatureUnion(transformer_list=[ 
("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipeline), ]) # train = pd.read_csv("train.csv") X_train = full_pipeline.fit_transform(train.loc[:, catFeatures + numFeatures]) gstatusSixMonths_train = train["GSTATUS_SIX_MONTHS"].values gstatusOneYear_train = train["GSTATUS_ONE_YEAR"].values gstatusThreeYears_train = train["GSTATUS_THREE_YEARS"].values gstatus_train = train["GSTATUS_THREE_YEARS"].values gtime_train = train["GTIME"].values Y_train = np.array([[gstatus_train[i], gtime_train[i]] for i in range(len(gtime_train)) ]) #[is_not_censored, survival time] test = pd.read_csv("test.csv") X_test = full_pipeline.transform(test.loc[:, catFeatures + numFeatures]) gstatusSixMonths_test = test["GSTATUS_SIX_MONTHS"].values gstatusOneYear_test = test["GSTATUS_ONE_YEAR"].values gstatusThreeYears_test = test["GSTATUS_THREE_YEARS"].values gstatus_test = test["GSTATUS_THREE_YEARS"].values gtime_test = test["GTIME"].values Y_test = np.array([[gstatus_test[i], gtime_test[i]] for i in range(len(gtime_test)) ]) #[is_not_censored, survival time] return X_train, Y_train, X_test, Y_test
import pandas as pd base = pd.read_csv('./datasets/credit-data.csv') #load a file base.loc[base.age < 0, 'age'] = base['age'][ base.age > 0].mean() #replace all negative ages to ages mean base.loc[pd.isnull(base['age'])] #find out on base all indexes with null age forecasts = base.iloc[:, 1: 4].values #forecasts variable will receive the columns 1,2 and 3 of base. The ":" means we want all lines. classes = base.iloc[:, 4].values #Using sklearn to localize all missing_values and replace using a strategy from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) imputer = imputer.fit(forecasts[:, 1:4]) forecasts[:, 1:4] = imputer.transform(forecasts[:, 1:4]) #when using knn algorithms is necessary standardisation or normalization from sklearn.preprocessing import StandardScaler scaler = StandardScaler() forecasts = scaler.fit_transform(forecasts) #Divide database in training data and test data from sklearn.cross_validation import train_test_split forecasts_training, forecasts_testing, classes_training, classes_testing = train_test_split( forecasts, classes, test_size=0.25, random_state=0) from sklearn.tree import DecisionTreeClassifier classifier = DecisionTreeClassifier(criterion='entropy', random_state=0) classifier.fit(forecasts_training, classes_training)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Sep 18 20:07:04 2018 @author: diego """ import pandas as pd import numpy as np from sklearn.preprocessing import Imputer, MinMaxScaler data = pd.read_csv('../data/pacientes_ucic.csv', sep=';') imputer = Imputer() minmaxscaler = MinMaxScaler() data['SAPS-3'] = imputer.fit_transform(data[['SAPS-3']]) """ data['SAPS-3'] = minmaxscaler.fit_transform(data[['SAPS-3']]) """ print(data['SAPS-3'])
#print(corr_matrix["median_house_value"].sort_values(ascending=False)) housing = strat_train_set.drop("median_house_value", axis=1) housing_labels = strat_train_set["median_house_value"].copy() sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head() #print(sample_incomplete_rows) sample_incomplete_rows.dropna(subset=["total_bedrooms"]) sample_incomplete_rows.drop("total_bedrooms", axis=1) median = housing["total_bedrooms"].median() sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True) #print(sample_incomplete_rows) imputer = Imputer(strategy="median") housing_num = housing.drop('ocean_proximity', axis=1) imputer.fit(housing_num) #print(imputer.statistics_) #print(housing_num.median().values) X = imputer.transform(housing_num) housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=list(housing.index.values)) housing_tr.loc[sample_incomplete_rows.index.values]
import pandas as pd dataset = pd.read_csv('data1.csv') X = dataset.iloc[:, 1:11].values y = dataset['Class'] ''' # Encoding categorical data from sklearn.preprocessing import LabelEncoder, OneHotEncoder labelencoder = LabelEncoder() X[:, 1:11] = labelencoder.fit_transform(X[:,1:11]) onehotencoder = OneHotEncoder(categorical_features = [10]) X = onehotencoder.fit_transform(X).toarray()''' # Handling missing values from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) imputer = imputer.fit(X[:, 1:11]) X[:, 1:11] = imputer.transform(X[:, 1:11]) # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() X_train = sc_X.fit_transform(X_train) X_test = sc_X.transform(X_test)
from sklearn.preprocessing import LabelEncoder labelencoder_X = LabelEncoder() X_tr[:, 34] = labelencoder_X.fit_transform(X_tr[:, 34].astype(str)) X_tr[:, 35] = labelencoder_X.fit_transform(X_tr[:, 35].astype(str)) X_tr[:, 68] = labelencoder_X.fit_transform(X_tr[:, 68].astype(str)) X_tr[:, 93] = labelencoder_X.fit_transform(X_tr[:, 93].astype(str)) X_ts[:, 34] = labelencoder_X.fit_transform(X_ts[:, 34].astype(str)) X_ts[:, 35] = labelencoder_X.fit_transform(X_ts[:, 35].astype(str)) X_ts[:, 68] = labelencoder_X.fit_transform(X_ts[:, 68].astype(str)) X_ts[:, 93] = labelencoder_X.fit_transform(X_ts[:, 93].astype(str)) # missing data from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) imputer = imputer.fit(X_tr[:, :]) X_tr[:, :] = imputer.transform(X_tr[:, :]) imputer = imputer.fit(X_ts[:, :]) X_ts[:, :] = imputer.transform(X_ts[:, :]) # Encoding categorical data: OneHotEncoder from sklearn.preprocessing import OneHotEncoder a = np.concatenate((X_tr, X_ts)) onehotencoder = OneHotEncoder(categorical_features=[34, 35, 68, 93], sparse=True) a = onehotencoder.fit_transform(a).toarray() X_tr = a[:len(X_tr), :] X_ts = a[len(X_tr):, :]
import seaborn as sns
sns.boxplot(x="Pclass", y="Age", hue="Survived", data=dataset, palette="Set3")

'''
--------------------------------------------------------------------------------------------------
DATA PREP
--------------------------------------------------------------------------------------------------
'''

# SELECTING X and y
X = dataset.iloc[:, [3, 4]]  # kept as a DataFrame so the .iloc/.info/.dropna calls below work
# X = dataset.iloc[:, [1, 3]].values

# HANDLING MISSING DATA
# fill NaN with mean/median/most_frequent
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X.iloc[:, 0].values.reshape((len(X), 1)))
X.iloc[:, 0] = imputer.transform(X.iloc[:, 0].values.reshape((len(X), 1)))
X.info()
X.drop(0, axis=1, inplace=True)

# Drop NaN
X.dropna(inplace=True)
y.dropna(inplace=True)

# ENCODING CATEGORICAL FEATURES
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# The values 0,1,2,etc into categorical values
labelencoder_X = LabelEncoder()
X.iloc[:, 1] = labelencoder_X.fit_transform(X.iloc[:, 1])
# Here we create the dummies
onehotencoder = OneHotEncoder(categorical_features=[1])
import numpy as np
import pandas as pd
from sklearn import model_selection
import pickle

# Importing the dataset for training
dataset = pd.read_csv('train.csv')
new_data = dataset.iloc[:, [0, 1, 2, 4, 5, 6, 7, 9, 11]]

## encoding the training data
data_dummy = pd.get_dummies(new_data)
##
X_train = data_dummy.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]].values
y_train = data_dummy.iloc[:, 1].values

# addressing missing values in the training data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_train[:, [2]])
X_train[:, [2]] = imputer.transform(X_train[:, [2]])

np.set_printoptions(threshold=np.nan)

test_dataset = pd.read_csv('test.csv')
test_verify = pd.read_csv('gender_submission.csv')
new_test_data = test_dataset.iloc[:, [0, 1, 3, 4, 5, 6, 8, 10]]

## encoding the test data
test_data_dummy = pd.get_dummies(new_test_data)
##
X_test = test_data_dummy.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].values
y_test = test_verify.iloc[:, [1]].values

# addressing missing values in the test data
from sklearn.preprocessing import Imputer
#test_X = test_data.drop(['Id'], axis=1) low_cardinality_cols = [ cname for cname in train_X.columns if train_X[cname].nunique() < 10 and train_X[cname].dtype == "object" ] numeric_cols = [ cname for cname in train_X.columns if train_X[cname].dtype in ['int64', 'float64'] ] my_cols = low_cardinality_cols + numeric_cols train_predictors = train_X[my_cols] #test_predictors = test_X[my_cols] #print(train_predictors.shape) one_hot_encoded_training_predictors = pd.get_dummies(train_predictors) #one_hot_encoded_test_predictors = pd.get_dummies(test_predictors) #print(*one_hot_encoded_training_predictors.columns, sep=',') #my_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': rf_val_predictions}) #one_hot_encoded_training_predictors.to_csv('TRAINING_1.csv', index=False) #final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors, # join='left', # axis=1) my_pipeline = make_pipeline(Imputer(), RandomForestRegressor()) #my_pipeline.fit(final_train, y) #print(my_pipeline.predict(final_test)) scores = cross_val_score(my_pipeline, one_hot_encoded_training_predictors, y, scoring='neg_mean_absolute_error') print('Mean Absolute Error %2f' % (-1 * scores.mean()))
import pandas as pd base = pd.read_csv('credit-data.csv') base.loc[base.age < 0, 'age'] = 40.92 previsores = base.iloc[:, 1:4].values classe = base.iloc[:, 4].values from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) imputer = imputer.fit(previsores[:, 1:4]) previsores[:, 1:4] = imputer.transform(previsores[:, 1:4]) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() previsores = scaler.fit_transform(previsores) from sklearn.model_selection import train_test_split previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split( previsores, classe, test_size=0.25, random_state=0) from sklearn.neural_network import MLPClassifier classificador = MLPClassifier(verbose=True, max_iter=1000, tol=0.0000010, solver='adam', hidden_layer_sizes=(100), activation='relu') classificador.fit(previsores_treinamento, classe_treinamento) previsoes = classificador.predict(previsores_teste)
import numpy as np

# import data and labels using the python array handling package numpy
data = np.loadtxt(
    "/Users/071cht/Desktop/programming_language_tutorial/Python/scikit/Ye_thesis/data_scikit/data.txt",
    delimiter=',')
labels = np.loadtxt(
    "/Users/071cht/Desktop/programming_language_tutorial/Python/scikit/Ye_thesis/data_scikit/labels.txt"
)
intLabels = labels.astype(int)

# scikit has an Imputer class which provides basic strategies for imputing missing values, using the mean/median/most
# frequent values of the row or column in which the missing values are located.
# The following code returns an np.array, data_nonmissing, which is a non-missing-value version of data
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(data)
data_nonmissing = imp.transform(data)

# rename data_nonmissing and intLabels to X and y for model fitting convenience
X = data_nonmissing
y = intLabels

# split into training (60%) and testing (40%) data
separateIdx = len(X) * 60 / 100
X_train = X[0:separateIdx]
y_train = y[0:separateIdx]
X_test = X[separateIdx:]
y_test = y[separateIdx:]
print X_train.shape, X_test.shape
# Data Preprocessing # Importing the Libraries import numpy as np import matplotlib.pyplot as plt import pandas as pd # Importing the dataset dataset = pd.read_csv('Data.csv') x = dataset.iloc[:, :-1].values y = dataset.iloc[:, 3].values # Take care of missing data from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) imputer = imputer.fit(x[:, 1:3]) x[:, 1:3] = imputer.transform(x[:, 1:3]) # Encoding categorical data from sklearn.preprocessing import LabelEncoder, OneHotEncoder labelencoder_x = LabelEncoder() x[:, 0] = labelencoder_x.fit_transform(x[:, 0]) onehotencoder = OneHotEncoder(categorical_features=[0]) x = onehotencoder.fit_transform(x).toarray() labelencoder_y = LabelEncoder() y = labelencoder_y.fit_transform(y) # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split
if(np.isnan(df_test['Age'][i])): if(df_test['Title'][i] == 'Mr'): df_test['Age'][i] = 25 if(df_test['Title'][i] == 'Mrs'): df_test['Age'][i] = 25 if(df_test['Title'][i] == 'Miss'): df_test['Age'][i] = 5 if(df_test['Title'][i] == 'Master'): df_test['Age'][i] = 5 # In[ ]: #After removing these features, it's time to fill the missing values imputer = Imputer(missing_values = np.nan, strategy = 'median', axis = 0) df_train[['Age']] = imputer.fit_transform(df_train[['Age']]) df_test[['Age']] = imputer.fit_transform(df_test[['Age']]) df_train.loc[ df_train['Age'] <= 16, 'Age'] = 0 df_train.loc[(df_train['Age'] > 16) & (df_train['Age'] <= 32), 'Age'] = 1 df_train.loc[(df_train['Age'] > 32) & (df_train['Age'] <= 48), 'Age'] = 2 df_train.loc[(df_train['Age'] > 48) & (df_train['Age'] <= 64), 'Age'] = 3 df_train.loc[ df_train['Age'] > 64, 'Age'] = 4 df_test.loc[ df_test['Age'] <= 16, 'Age'] = 0 df_test.loc[(df_test['Age'] > 16) & (df_test['Age'] <= 32), 'Age'] = 1 df_test.loc[(df_test['Age'] > 32) & (df_test['Age'] <= 48), 'Age'] = 2 df_test.loc[(df_test['Age'] > 48) & (df_test['Age'] <= 64), 'Age'] = 3 df_test.loc[ df_test['Age'] > 64, 'Age'] = 4
#coding:utf8 import numpy as np import pandas as pd from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler from sklearn.model_selection import train_test_split df = pd.read_csv("./resource/1.csv") X = df.iloc[:, :-1].values Y = df.iloc[:, -1].values # dealing with missing data imputer = Imputer(missing_values="NaN", strategy="mean") imputer.fit(X[:, 1:3]) X[:, 1:3] = imputer.transform(X[:, 1:3]) # print(X) # deal with class y data encoder_X = LabelEncoder() X[:, 0] = encoder_X.fit_transform(X[:, 0]) # print(X) one_hot_encoder = OneHotEncoder(categorical_features=[0]) X = one_hot_encoder.fit_transform(X).toarray() # print(one_hot_encoder.n_values_) # print(one_hot_encoder.feature_indices_) # print(X)
import seaborn as sns #Importing train data train = pd.read_csv("titanic_train.csv") #Splitting the train dataset train.drop('Cabin', axis=1, inplace=True) x_train = train.drop('Survived', axis=1) y_train = train['Survived'] #Visualising train data for null values sns.heatmap(x_train.isnull()) #Filling the missing values of Age column in train dataset with its mean. from sklearn.preprocessing import Imputer imputer_train = Imputer(missing_values="NaN", strategy="mean", axis=0) imputer_train = imputer_train.fit(x_train['Age'].values.reshape(-1, 1)) x_train['Age'] = imputer_train.transform(x_train['Age'].values.reshape(-1, 1)) #Visualising train data for null values sns.heatmap(x_train.isnull()) #Importing test data test = pd.read_csv('titanic_test.csv') x_test = train.drop('Survived', axis=1) y_test = train['Survived'] #Visualising test data for null values sns.heatmap(x_test.isnull()) #Filling the missing values of Age column in test dataset with its mean.
import pandas as pd import numpy as np data = pd.read_excel("/home/karthik/Cauvery Study/dataset_rainfall_whole.xlsx") X = data.iloc[:, :-1].values y = data.iloc[:, -1].values from sklearn.preprocessing import Imputer imputer = Imputer(missing_values=0, strategy='most_frequent', axis=0) imputer = imputer.fit(X[:, [5, 6]]) X[:, [5, 6]] = imputer.transform(X[:, [5, 6]]) X1 = data.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8]] #imputer = imputer.fit(y[:,0:1]) #y[:,0:1] = imputer.transform(y[:,0:1]) #from sklearn.model_selection import train_test_split #X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1) from sklearn.ensemble import RandomForestRegressor regressor = RandomForestRegressor(n_estimators=10) regressor.fit(X, y) from tkinter import * master = Tk() y_pred = regressor.predict(X1).astype(int)
def imputer(X): #fill in empty values imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0) imp = imp.fit(X[:, [1, 2, 3, -1]]) X[:, [1, 2, 3, -1]] = imp.transform(X[:, [1, 2, 3, -1]]) return X
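# Hypothetical usage of the imputer(X) helper above (not from the source): X is assumed to be a 2-D
# numeric numpy array whose columns 1-3 and last column may contain NaN values.
X = np.array([[0.0, 1.0, np.nan, 3.0, 1.0],
              [1.0, 1.0, 2.0, np.nan, np.nan],
              [2.0, np.nan, 2.0, 3.0, 1.0]])
X = imputer(X)  # NaNs in the selected columns are replaced by each column's most frequent value
print(X)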
from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler train_data = pd.read_csv('train_data/data.csv', delimiter=',') x = train_data[train_data.keys()[:-1]].values y = train_data['result'].values result_encoder = LabelEncoder() result_encoder.fit(y) y = result_encoder.transform(y) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) # Data normalization imputer = Imputer(strategy='mean') imputer.fit(x_train) X_train = imputer.transform(x_train) X_test = imputer.transform(x_test) scaler = StandardScaler() scaler.fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) model = Sequential([ Dense(26, input_dim=x.shape[1]), Activation('relu'), Dense(1), Activation('sigmoid') ])
import numpy as np import matplotlib.pyplot as plt import pandas as pd np.set_printoptions(threshold=np.nan) dataset = pd.read_csv('Data.csv') X = dataset.iloc[:, :-1].values Y = dataset.iloc[:, 3].values from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder imputer = Imputer(missing_values='NaN', strategy="mean", axis=0) imputer = imputer.fit(X[:, 1:3]) X[:, 1:3] = imputer.transform(X[:, 1:3]) labelEncoder = LabelEncoder() X[:, 0] = labelEncoder.fit_transform(X[:, 0]) oneHotEncoder_X = OneHotEncoder(categorical_features=[0]) X = oneHotEncoder_X.fit_transform(X).toarray() labelEncoder_Y = LabelEncoder() Y = labelEncoder_Y.fit_transform(Y)
dfg.sum() df[df['R6'] > 1] df[df.R6 > 1] = df.R6.mean() df[df.R5 > 1] = df.R5.mean() df[df.R19 > 1] = df.R19.mean() df.max() df.columns X = df.drop(['object'], axis=1) Y = df['object'] Y.iloc[12] = 'R' Y.iloc[19] = 'R' Y.iloc[200] = 'M' #Removing NA Values #Take care of Missing data from sklearn.preprocessing import Imputer imputer = Imputer(missing_values="NaN", strategy='mean', axis=0) imputer = imputer.fit(X) X = imputer.transform(X) #Converting Categorical data #Encoding categorical data using dummy variables for X from sklearn.preprocessing import LabelEncoder, OneHotEncoder labelencoder_Y = LabelEncoder() Y = labelencoder_Y.fit_transform(Y) onehotencoder = OneHotEncoder(categorical_features=[0]) Y = onehotencoder.fit_transform(Y).toarray() from sklearn.model_selection import train_test_split X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
}

# Replacing Categorical Values with the Encoded Values
for col in col_names:
    if df[col].dtype == 'O':
        df[col] = df[col].replace(enc[col])

# Creating Separate Dataframes for Features and Class
X = df.iloc[:, :-1].values
y = df.iloc[:, 12].values

# Removing Loan_ID Column from the Dataset
X = np.delete(X, 0, 1)

# Creating Instances of Imputer Class for Missing Value Management
imputer_mode = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer_mean = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Replacing 'NaN' Values with Mode of the Values in the Respective Columns
X[:, :7] = imputer_mode.fit_transform(X[:, :7])
X[:, 9:] = imputer_mode.fit_transform(X[:, 9:])

# Replacing 'NaN' Values present in "LoanAmount" Column with the Mean of the Values of that Column
X_temp = X[:, 7].reshape(-1, 1)
X_temp = imputer_mean.fit_transform(X_temp)
X_temp = X_temp.reshape(1, -1)
X = np.delete(X, 7, 1)
X = np.insert(X, 7, X_temp, axis=1)

#----------------------------------------Data Preprocessing and Data Cleaning----------------------------------------
action='store_true', help='Whether to use scikit data balancing by changing penalties ' 'in learning algorithm formulation or manually balance by ' 'undersampling majority class and oversampling minority class') args = parser.parse_args() if __name__ == "__main__": # Let numpy know that NA corresponds to our missing value data = numpy.genfromtxt(args.input_filename, delimiter=",", skip_header=1, missing_values="NA", filling_values="NaN") # Impute the data and replace missing values imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False) imputer.fit(data) data = imputer.transform(data) features = data[:, 0:args.label_column] labels = data[:, args.label_column].astype(int) train_features, test_features, train_labels, test_labels = ( model_selection.train_test_split(features, labels, test_size=0.20)) # scale data only if model is linear (svm, logisitic regression) or scales of features # are relevant (knn) if args.algorithm in ['linear-svm', 'kernel-svm', 'logistic', 'knn']: (train_features, test_features) = utils.scale_data(train_features, test_features, 'minmax')
def __init__(self): self.reg = Pipeline([ ('imputer', Imputer(strategy='median')), ('regressor', RandomForestRegressor(n_estimators = 500, max_features=0.5, min_samples_leaf = 5)) ])
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier from sklearn.model_selection import train_test_split from sklearn.naive_bayes import BernoulliNB from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import Imputer, Normalizer from tpot.builtins import StackingEstimator # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) imputer = Imputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) # Score on the training set was:0.9348076424295388 exported_pipeline = make_pipeline( StackingEstimator( estimator=GradientBoostingClassifier(learning_rate=0.01, max_depth=4, max_features=0.05, min_samples_leaf=10, min_samples_split=19, n_estimators=100, subsample=0.9000000000000001)), Normalizer(norm="max"),
@author: Ashlin """ import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.pipeline import Pipeline mydata = pd.read_csv('Data.csv') print mydata.head() print mydata.iloc[:, 0] print mydata.head() X = mydata.iloc[:, 0:3].values y = mydata.iloc[:, 3].values from sklearn.preprocessing import Imputer imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) X[:, 1:3] = imputer.fit_transform(X[:, 1:3]) print X from sklearn.preprocessing import LabelEncoder, OneHotEncoder labelencoder = LabelEncoder() X[:, 0] = labelencoder.fit_transform(X[:, 0]) onehotencoder = OneHotEncoder(categorical_features=[0]) X = onehotencoder.fit_transform(X).toarray() print X y = labelencoder.fit_transform(y) print y print X[:, 2] from sklearn.model_selection import train_test_split
def predict_catkit_demo(images): """Return a prediction of adsorption energies for structures generated with CatKitDemo. Parameters ---------- images : list List of atoms objects representing adsorbate-surface structures. model : str Path and filename of Catlearn model pickle. """ model_ref = {'H': 'H2', 'O': 'H2O, H2', 'C': 'CH4, H2'} # Make list of strings showing the references. display_ref = [] for atoms in images: try: initial_state = [model_ref[s] for s in ase.atoms.string2symbols( atoms.info['key_value_pairs']['species'])] except KeyError: return {} display_ref.append( '*, ' + ', '.join(list(np.unique(initial_state)))) images = autogen_info(images) gen = FeatureGenerator(nprocs=1) train_fpv = default_fingerprinters(gen, 'adsorbates') train_fpv = [gen.mean_chemisorbed_atoms, gen.count_chemisorbed_fragment, gen.count_ads_atoms, gen.count_ads_bonds, gen.ads_av, gen.ads_sum, gen.bulk, gen.term, gen.strain, gen.mean_surf_ligands, gen.mean_site, gen.median_site, gen.max_site, gen.min_site, gen.sum_site, gen.generalized_cn, gen.en_difference_ads, gen.en_difference_chemi, gen.en_difference_active, gen.db_size, gen.delta_energy] matrix = gen.return_vec(images, train_fpv) feature_index = np.load(clean_index_name) clean_feature_mean = np.load(clean_mean) impute = Imputer(missing_values="NaN", strategy='mean') impute.statistics_ = clean_feature_mean new_data = impute.transform(matrix[:, feature_index]) prediction = gp.predict(new_data, get_validation_error=False, get_training_error=False, uncertainty=True) output = {'mean': list(prediction['prediction']), 'uncertainty': list(prediction['uncertainty']), 'references': display_ref} return output
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
import xgboost as xgb
from ZK import ZKTools

# --------------------
print('Loading data...')
path = '/Users/Bing/Documents/DS/Zillow_Kaggle/'
df_train = pd.read_csv('train_features.csv')
df_target = pd.read_csv('train_target.csv').values.ravel()

imp = Imputer()
df_train_imp = pd.DataFrame(imp.fit_transform(df_train), columns=df_train.columns)

n_estimators = 1
random_state = 0

# RF
rf_params = {
    'n_jobs': -1,
    'n_estimators': n_estimators,
    'warm_start': True,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}
rf = RandomForestRegressor(random_state=random_state, **rf_params)
rf_CV = ZKTools.CV(df_train=df_train_imp,
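# --- Illustrative aside (not part of the Zillow script above) ---
# The Imputer + pd.DataFrame round-trip above can also be done directly in
# pandas, which keeps the column names and index without reconstruction.
# A minimal sketch on a toy frame standing in for df_train.
import numpy as np
import pandas as pd

toy_train = pd.DataFrame({'feat_a': [1.0, np.nan, 3.0],
                          'feat_b': [np.nan, 2.0, 4.0]})
toy_train_imp = toy_train.fillna(toy_train.mean())   # per-column mean fill, DataFrame in, DataFrame out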
class NetworkClassifer():

    def __init__(self, features, labels, validation_features, validation_labels):
        self.features = features
        self.feature_labels = [
            'min', 'max', 'mean', 'skew', 'std', 'kurtosis',
            'sum of absolute difference', 'baseline_n', 'baseline_diff',
            'baseline_diff_skew', 'n_pks', 'n_vals', 'av_pk', 'av_val',
            'av pk val range', '1 hz', '5 hz', '10 hz', '15 hz', '20 hz',
            '30 hz', '60 hz', '90 hz'
        ]
        self.labels = np.ravel(labels)
        self.validation_features = validation_features
        self.validation_labels = np.ravel(validation_labels)
        self.impute_and_scale()

    def impute_and_scale(self):
        print('Scaling and imputing training dataset...')
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(self.features)
        imputed_features = self.imputer.transform(self.features)
        self.std_scaler = StandardScaler()
        self.std_scaler.fit(imputed_features)
        self.iss_features = self.std_scaler.transform(imputed_features)
        print('Done')

        print('Scaling and imputing validation features using training dataset...')
        imputed_validation_features = self.imputer.transform(
            self.validation_features)
        self.iss_validation_features = self.std_scaler.transform(
            imputed_validation_features)
        print('Done')

    def _cross_validation(self, clf, k_folds=5):
        self.scores = cross_validation.cross_val_score(clf,
                                                       self.iss_features,
                                                       self.labels,
                                                       cv=k_folds,
                                                       n_jobs=5,
                                                       scoring='roc_auc')

    def randomforest_info(self, max_trees=1000, step=40, k_folds=5):
        print('Characterising R_forest. Looping through trees: ')
        # Columns: n_trees, cv mean, cv std, validation score, then the same
        # cv mean/std/validation triplet for the LDA- and PCA-reduced features.
        self.treedata = np.zeros((max_trees // step, 10))
        for i, n_trees in enumerate(np.arange(0, max_trees, step)):
            if n_trees == 0:
                n_trees = 1

            r_forest = RandomForestClassifier(n_estimators=n_trees,
                                              n_jobs=5,
                                              max_depth=None,
                                              min_samples_split=1,
                                              random_state=0)
            scores = cross_validation.cross_val_score(r_forest,
                                                      self.iss_features,
                                                      self.labels,
                                                      cv=k_folds,
                                                      n_jobs=5)
            r_forest_full = RandomForestClassifier(n_estimators=n_trees,
                                                   n_jobs=5,
                                                   max_depth=None,
                                                   min_samples_split=1,
                                                   random_state=0)
            r_forest_full.fit(self.iss_features, self.labels)

            self.treedata[i, 0] = n_trees
            self.treedata[i, 1] = scores.mean()
            self.treedata[i, 2] = scores.std()
            # now add the test dataset - score
            self.treedata[i, 3] = r_forest_full.score(
                self.iss_validation_features, self.validation_labels)

            r_forest_lda = RandomForestClassifier(n_estimators=n_trees,
                                                  n_jobs=5,
                                                  max_depth=None,
                                                  min_samples_split=1,
                                                  random_state=0)
            r_forest_lda_full = RandomForestClassifier(n_estimators=n_trees,
                                                       n_jobs=5,
                                                       max_depth=None,
                                                       min_samples_split=1,
                                                       random_state=0)
            r_forest_lda_full.fit(self.lda_iss_features, self.labels)
            lda_scores = cross_validation.cross_val_score(r_forest_lda,
                                                          self.lda_iss_features,
                                                          self.labels,
                                                          cv=k_folds,
                                                          n_jobs=5)
            self.treedata[i, 4] = lda_scores.mean()
            self.treedata[i, 5] = lda_scores.std()
            self.treedata[i, 6] = r_forest_lda_full.score(
                self.lda_iss_validation_features, self.validation_labels)

            r_forest_pca = RandomForestClassifier(n_estimators=n_trees,
                                                  n_jobs=5,
                                                  max_depth=None,
                                                  min_samples_split=1,
                                                  random_state=0)
            r_forest_pca_full = RandomForestClassifier(n_estimators=n_trees,
                                                       n_jobs=5,
                                                       max_depth=None,
                                                       min_samples_split=1,
                                                       random_state=0)
            r_forest_pca_full.fit(self.pca_iss_features, self.labels)
            pca_scores = cross_validation.cross_val_score(r_forest_pca,
                                                          self.pca_iss_features,
                                                          self.labels,
                                                          cv=k_folds,
                                                          n_jobs=5)
            self.treedata[i, 7] = pca_scores.mean()
            self.treedata[i, 8] = pca_scores.std()
            self.treedata[i, 9] = r_forest_pca_full.score(
                self.pca_iss_validation_features, self.validation_labels)

    def pca(self, n_components=6):
        self.pca = PCA(n_components)
        self.pca_iss_features = self.pca.fit_transform(self.iss_features)
        self.pca_iss_validation_features = self.pca.transform(
            self.iss_validation_features)

    def lda(self, n_components=2, pca_reg=True, reg_dimensions=10):
        self.lda = LinearDiscriminantAnalysis(n_components=n_components,
                                              solver='eigen',
                                              shrinkage='auto')
        #self.lda = LDA(n_components)
        if pca_reg:
            self.pca_reg = PCA(reg_dimensions)
            pca_reg_features = self.pca_reg.fit_transform(self.iss_features)
            self.lda_iss_features = self.lda.fit_transform(
                pca_reg_features, self.labels)
            pca_reg_validation_features = self.pca_reg.transform(
                self.iss_validation_features)
            self.lda_iss_validation_features = self.lda.transform(
                pca_reg_validation_features)
        else:
            self.lda_iss_features = self.lda.fit_transform(
                self.iss_features, self.labels)
            self.lda_iss_validation_features = self.lda.transform(
                self.iss_validation_features)

    def lda_run(self, k_folds=5):
        self.r_forest_lda = RandomForestClassifier(n_estimators=2000,
                                                   n_jobs=5,
                                                   max_depth=None,
                                                   min_samples_split=2,
                                                   random_state=7,
                                                   max_leaf_nodes=None,
                                                   min_samples_leaf=2,
                                                   criterion='gini',
                                                   max_features='sqrt',
                                                   class_weight='balanced')
        self.lda_scores = cross_validation.cross_val_score(self.r_forest_lda,
                                                           self.lda_iss_features,
                                                           self.labels,
                                                           cv=k_folds,
                                                           n_jobs=5)
        print("Cross validation Random Forest performance LDA: Accuracy: %0.2f (std %0.2f)"
              % (self.lda_scores.mean() * 100, self.lda_scores.std() * 100))

        self.r_forest_lda.fit(self.lda_iss_features, self.labels)
        print(str(self.r_forest_lda.score(self.lda_iss_validation_features,
                                          self.validation_labels) * 100)
              + ' LDA test-set performance')

        y_true = self.validation_labels
        y_pred = self.r_forest_lda.predict(self.lda_iss_validation_features)
        target_names = ['S1', 'S2', 'S3', 'S4']
        report = classification_report(y_true, y_pred, target_names=target_names)
        print('Random forest report lda')
        print(report)

        ##### Hacky way to export features, so can optimise RF etc ######
        train_X = pd.DataFrame(self.lda_iss_features)
        train_y = pd.DataFrame(self.labels)
        training = pd.concat([train_X, train_y], axis=1)
        training.to_csv(
            '/Users/Jonathan/Dropbox/Data_sharing_VMJC/training_lda.csv',
            index=False)

        test_X = pd.DataFrame(self.lda_iss_validation_features)
        test_y = pd.DataFrame(self.validation_labels)
        test = pd.concat([test_X, test_y], axis=1)
        test.to_csv('/Users/Jonathan/Dropbox/Data_sharing_VMJC/test_lda.csv',
                    index=False)

        train_X = pd.DataFrame(self.iss_features)
        train_y = pd.DataFrame(self.labels)
        training = pd.concat([train_X, train_y], axis=1)
        training.to_csv(
            '/Users/Jonathan/Dropbox/Data_sharing_VMJC/training.csv',
            index=False)

        test_X = pd.DataFrame(self.iss_validation_features)
        test_y = pd.DataFrame(self.validation_labels)
        test = pd.concat([test_X, test_y], axis=1)
        test.to_csv('/Users/Jonathan/Dropbox/Data_sharing_VMJC/test.csv',
                    index=False)

    def pca_run(self, k_folds=5):
        self.r_forest_pca = RandomForestClassifier(n_estimators=2000,
                                                   n_jobs=5,
                                                   max_depth=None,
                                                   min_samples_split=1,
                                                   random_state=0)
        self.pca_scores = cross_validation.cross_val_score(self.r_forest_pca,
                                                           self.pca_iss_features,
                                                           self.labels,
                                                           cv=k_folds,
                                                           n_jobs=5)
        print("Cross validation RF performance PCA: Accuracy: %0.2f (std %0.2f)"
              % (self.pca_scores.mean() * 100, self.pca_scores.std() * 100))

        self.r_forest_pca.fit(self.pca_iss_features, self.labels)
        print(str(self.r_forest_pca.score(self.pca_iss_validation_features,
                                          self.validation_labels))
              + ' PCA test-set performance')

    def run(self):
        r_forest = RandomForestClassifier(n_estimators=2000,
                                          n_jobs=5,
                                          max_depth=None,
                                          min_samples_split=1,
                                          random_state=0,
                                          class_weight='balanced')
        self._cross_validation(r_forest)
        print("Cross validation RF performance: Accuracy: %0.2f (std %0.2f)"
              % (self.scores.mean() * 100, self.scores.std() * 100))

        self.r_forest = RandomForestClassifier(n_estimators=2000,
                                               n_jobs=5,
                                               max_depth=None,
                                               min_samples_split=1,
                                               random_state=0,
                                               class_weight='balanced')
        self.r_forest.fit(self.iss_features, self.labels)
        print(str(self.r_forest.score(self.iss_validation_features,
                                      self.validation_labels))
              + ' randomforest test-set performance')

        y_true = self.validation_labels
        y_pred = self.r_forest.predict(self.iss_validation_features)
        target_names = ['inter-ictal', 'ictal']
        target_names = ['S1', 'S2', 'S3', 'S4']
        t = classification_report(y_true, y_pred, target_names=target_names)
        print('Random forest report:')
        print(t)
        return None
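# --- Illustrative usage sketch (not from the original source) ---
# How NetworkClassifer might be driven, assuming the module-level imports the
# class body relies on (numpy as np, pandas as pd, Imputer, StandardScaler,
# PCA, LinearDiscriminantAnalysis, RandomForestClassifier, cross_validation,
# classification_report) are present. The random arrays below are placeholders
# for the real feature matrices.
import numpy as np

n_features = 23                                    # matches len(self.feature_labels) above
X_train = np.random.rand(120, n_features)
y_train = np.random.randint(1, 5, size=(120, 1))   # four classes, as in target_names S1..S4
X_val = np.random.rand(40, n_features)
y_val = np.random.randint(1, 5, size=(40, 1))

clf = NetworkClassifer(X_train, y_train, X_val, y_val)  # imputes and scales in __init__
clf.pca(n_components=6)   # build the PCA-reduced feature sets
clf.lda(n_components=2)   # build the LDA projection (regularised via a 10-component PCA)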