def remove_outliers(image, mask):
    # apply the mask to the image to check for the presence of a bee
    im = cv2.bitwise_and(image, image, mask=mask)
    ldp_image, _, _ = ldp.ldp(im)
    test_Y = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2]))
    test_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2]))
    # keep the same (rgb, ldp) feature order as the training data below
    test = np.concatenate((test_rgb, test_Y), axis=1)

    # build the training data from the region outside the mask
    mask_not = cv2.bitwise_not(mask)
    ret1, mask_not = cv2.threshold(mask_not, np.mean(mask_not), 255, cv2.THRESH_BINARY)
    im = cv2.bitwise_and(image, image, mask=mask_not)
    ldp_image, _, _ = ldp.ldp(im)
    data_ldp = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2]))
    data_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2]))
    data = np.concatenate((data_rgb, data_ldp), axis=1)
    data = data[np.any(data != 0, axis=1)]
    print(data.shape)

    data = data.astype('float64')
    data = preprocessing.normalize(data, axis=0)
    ss = StandardScaler()
    data = ss.fit_transform(data)

    clf = svm.OneClassSVM(nu=0.8, kernel="rbf", gamma=0.1)
    clf.fit(data)

    test = test.astype('float64')
    test = preprocessing.normalize(test, axis=0)
    print(test.shape)
    # reuse the scaler fitted on the training pixels instead of refitting on the test pixels
    test = ss.transform(test)
    test = clf.predict(test)
    test = test.reshape((image.shape[0], image.shape[1]))
    test[test == -1] = 0
    test[test == 1] = 255
    test = test.astype('uint8')

    im = cv2.bitwise_and(image, image, mask=test)
    im = cv2.bitwise_and(im, im, mask=mask)
    # print(test[:, 0], test[:, 1])
    return (im, test)
class LinearXGB(ClippedMixin): trained = set() cache = {} def __init__(self, params, num_rounds): self.params = params self.scaler = StandardScaler(with_mean=False) self.num_rounds = num_rounds def fit(self, dense, svd, sparse, y): X_train = np.hstack((dense, svd)) #X_train = hstack((X_train, sparse)) train_hash = hash(str(X_train)) if train_hash not in self.trained: X_scaled = self.scaler.fit_transform(X_train) X_scaled = normalize(X_scaled) dtrain = xgb.DMatrix(X_scaled, label=y) watchlist = [(dtrain, 'train')] self.bst = xgb.train(self.params, dtrain, self.num_rounds)#, watchlist) self.trained.add(train_hash) def predict(self, dense, svd, sparse): X_test = np.hstack((dense, svd)) #X_test = hstack((X_test, sparse)) test_hash = hash(str(X_test)) if test_hash not in self.cache: #X_scaled = X_test X_scaled = self.scaler.fit_transform(X_test) X_scaled = normalize(X_scaled) dtest = xgb.DMatrix(X_scaled) #dtest = xgb.DMatrix(X_test) y_pred = self.bst.predict(dtest) self.cache[test_hash] = y_pred return self.cache[test_hash]
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Cluster the (ycenter, xcenter, flux) samples with DBSCAN and flag the clean ones.

    Args:
        phots: array of photometric fluxes; non-finite values are replaced by the median.
        ycenters: array of y centroid positions.
        xcenters: array of x centroid positions.
        dbsClean: DBSCAN label treated as the "clean" cluster (default 0).
        useTheForce: unused in this function.

    Returns:
        Boolean array, True where DBSCAN assigns the sample to the clean cluster.
    """
    dbsPhots = DBSCAN()  # n_jobs=-1
    stdScaler = StandardScaler()

    phots = np.copy(phots.ravel())
    phots[~np.isfinite(phots)] = np.median(phots[np.isfinite(phots)])

    featuresNow = np.transpose([stdScaler.fit_transform(ycenters[:, None]).ravel(),
                                stdScaler.fit_transform(xcenters[:, None]).ravel(),
                                stdScaler.fit_transform(phots[:, None]).ravel()])

    # print(featuresNow.shape)
    dbsPhotsPred = dbsPhots.fit_predict(featuresNow)

    return dbsPhotsPred == dbsClean
def prep_X_y(df, constant=False, split=True):
    cols_to_exclude = ['venue_state', 'venue_name', 'venue_country', 'venue_address',
                       'ticket_types', 'email_domain', 'description', 'previous_payouts',
                       'payee_name', 'org_name', 'org_desc', 'object_id', 'name',
                       'acct_type', 'country', 'listed', 'currency', 'payout_type',
                       'channels']
    if constant:
        df['const'] = 1
    X = df.drop(cols_to_exclude + ['fraud'], axis=1).values
    y = df['fraud'].values
    print('columns used:\n{}'.format(df.drop(cols_to_exclude + ['fraud'], axis=1).columns))
    if split:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # transform (not refit) the test set with the scaler fit on the training set
        X_test = scaler.transform(X_test)
        X_smoted, y_smoted = smote(X_train, y_train, target=.5)
        return X_smoted, X_test, y_smoted, y_test
    else:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        X_smoted, y_smoted = smote(X, y, target=.5)
        return X_smoted, y_smoted
def logisticRegression():
    data = loadtxtAndcsv_data("data1.txt", ",", np.float64)
    X = data[:, 0:-1]
    y = data[:, -1]

    # split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # standardize: fit on the training set, then apply the same transform to the test set
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # logistic regression
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # predict
    predict = model.predict(x_test)
    right = sum(predict == y_test)

    # stack predictions next to the true labels for easy inspection
    predict = np.hstack((predict.reshape(-1, 1), y_test.reshape(-1, 1)))
    print(predict)
    print('test set accuracy: %f%%' % (right * 100.0 / predict.shape[0]))
def test_same_fit_transform(self): X, X_rdd = self.make_dense_rdd() local = StandardScaler() dist = SparkStandardScaler() X_trans = local.fit_transform(X) X_rdd_trans = dist.fit_transform(X_rdd).toarray() X_converted = dist.to_scikit().transform(X) assert_array_almost_equal(X_trans, X_rdd_trans) assert_array_almost_equal(X_trans, X_converted) local = StandardScaler(with_mean=False) dist = SparkStandardScaler(with_mean=False) X_trans = local.fit_transform(X) X_rdd_trans = dist.fit_transform(X_rdd).toarray() X_converted = dist.to_scikit().transform(X) assert_array_almost_equal(X_trans, X_rdd_trans) assert_array_almost_equal(X_trans, X_converted) local = StandardScaler(with_std=False) dist = SparkStandardScaler(with_std=False) X_trans = local.fit_transform(X) X_rdd_trans = dist.fit_transform(X_rdd).toarray() X_converted = dist.to_scikit().transform(X) assert_array_almost_equal(X_trans, X_rdd_trans) assert_array_almost_equal(X_trans, X_converted)
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False): """Generate a regression dataset with the given parameters.""" if verbose: print("generating dataset...") X, y, coef = make_regression(n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True) random_seed = 13 X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=n_train, random_state=random_seed) X_train, y_train = shuffle(X_train, y_train, random_state=random_seed) X_scaler = StandardScaler() X_train = X_scaler.fit_transform(X_train) X_test = X_scaler.transform(X_test) y_scaler = StandardScaler() y_train = y_scaler.fit_transform(y_train[:, None])[:, 0] y_test = y_scaler.transform(y_test[:, None])[:, 0] gc.collect() if verbose: print("ok") return X_train, y_train, X_test, y_test
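# Note (illustrative, not part of generate_dataset): the snippet above standardizes the
# target by hand with y_train[:, None] and a second StandardScaler. A minimal hedged
# alternative, assuming scikit-learn >= 0.20 is available, is TransformedTargetRegressor,
# which scales y for fitting and inverts the scaling on prediction automatically.
from sklearn.compose import TransformedTargetRegressor
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=0.1)

# The target is standardized internally for fitting; predictions come back in original units.
ttr = TransformedTargetRegressor(regressor=Ridge(), transformer=StandardScaler())
ttr.fit(X_demo, y_demo)
print(ttr.predict(X_demo[:5]))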
class TrainValidSplitter(object): def __init__(self, standardize=True, few=False): self.standardize = standardize self.few = few self.standa = None def __call__(self, X, y, net): strati = StratifiedShuffleSplit(y = y, n_iter = 1, test_size = 0.2, random_state = 1234) train_indices, valid_indices = next(iter(strati)) if self.standardize: self.standa = StandardScaler() if self.few: X_train = np.hstack((self.standa.fit_transform(X[train_indices,:23]), X[train_indices,23:])) X_valid = np.hstack((self.standa.transform(X[valid_indices,:23]), X[valid_indices,23:])) else: X_train = self.standa.fit_transform(X[train_indices]) X_valid = self.standa.transform(X[valid_indices]) else: X_train, X_valid = X[train_indices], X[valid_indices] y_train, y_valid = y[train_indices], y[valid_indices] return X_train, X_valid, y_train, y_valid
def _transform_data(): from solaris.run import load_data from solaris.models import LocalModel data = load_data() X = data['X_train'] y = data['y_train'] # no shuffle - past-future split offset = X.shape[0] * 0.5 X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] print('_' * 80) print('transforming data') print tf = LocalModel(None) print('transforming train') X_train, y_train = tf.transform(X_train, y_train) print('transforming test') X_test, y_test = tf.transform(X_test, y_test) print('fin') scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) scaler = StandardScaler() y_train = scaler.fit_transform(y_train) y_test = scaler.transform(y_test) data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test} joblib.dump(data, 'data/dbndata.pkl')
def batchScaling(in_root="raw", out_root="data", with_mean=True, with_std=True): Xy_files = filter(lambda x:x.endswith(".Xy.npz"), os.listdir(in_root)) # Xy_files = ["image_rgb_gist.Xy.npz"] for Xy_file in Xy_files: in_path = os.path.join( in_root, Xy_file ) out_path = os.path.join( out_root, Xy_file ) print '> load %s' % ( in_path ) data = np.load( in_path ) ## detect sparse or dense _sparse = True if len(data['X'].shape) == 0 else False print '> scaling' if _sparse: ## Cannot center sparse matrices: pass `with_mean=False` instead. print '>> Sparse matrix detected. Use with_mean=False' scaler = StandardScaler(with_mean=False, with_std=with_std) X = scaler.fit_transform( data['X'].all() ) else: scaler = StandardScaler(with_mean=with_mean, with_std=with_std) X = scaler.fit_transform( data['X'] ) print '> compressing and dumping to %s' % (out_path) np.savez_compressed(out_path, X=X, y=data['y']) print '='*50
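# Small illustrative sketch (not part of batchScaling) of why the sparse branch above
# passes with_mean=False: centering a sparse matrix would turn the zeros into non-zeros
# and densify it. MaxAbsScaler is another sparsity-preserving option.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

X_sparse = csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0], [0.0, 3.0]]))

# Scale by standard deviation only; the sparsity pattern is preserved.
X_std = StandardScaler(with_mean=False).fit_transform(X_sparse)

# Scale each feature by its maximum absolute value; also keeps the matrix sparse.
X_maxabs = MaxAbsScaler().fit_transform(X_sparse)

print(type(X_std), X_std.nnz, X_maxabs.nnz)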
def data_fr(novel_num): #if csv_file(novel, novel_num) is True: nn = str(novel_num) df_novel = pd.read_csv('novel_'+nn+'list_1.csv', header=None) try: df_novel['wrd_length'] = df_novel[0].apply(wrd_lengths) df_novel['total_char'] = [sum(l) for l in df_novel['wrd_length']] df_novel['syl_count'] = df_novel[0].apply(syl_count) df_novel['syl_sum'] = [sum(l) for l in df_novel['syl_count']] df_novel['sentiment'] = df_novel[0].apply(detect_sentiment) #create csv for word to syl to improve syl function d = {} for l in df_novel[0]: sent = TextBlob(l) for x in sent.words: w = CountSyllables(x) d[x] = w with open('novel_'+nn+'list_1_syl.csv', 'wb') as f: writer = csv.writer(f) for row in d.iteritems(): writer.writerow(row) #create cluster columns df_cluster = df_novel.drop('wrd_length', 1) df_cluster = df_cluster.drop('syl_count', 1) X = df_cluster.drop(0, axis = 1) scaler = StandardScaler() X_scaled = scaler.fit_transform(X) km = KMeans(n_clusters=20, random_state=1) km.fit(X_scaled) df_cluster_20 = df_cluster.copy() df_cluster_20['cluster'] = km.labels_ df_novel['cluster_20'] = df_cluster_20['cluster'] #Create cluster 3 df_cluster_3 = df_cluster.copy() X = df_cluster_3.drop(0, axis=1) X_scaled = scaler.fit_transform(X) km = KMeans(n_clusters = 3, random_state=1) km.fit(X_scaled) df_cluster_3['cluster'] = km.labels_ df_novel['cluster_3_syl'] = df_cluster_3['cluster'] #create cluster 3 no syl df_cluster_3no_syl = df_cluster.copy() X = df_cluster_3no_syl.drop(0, axis=1) X_scaled = scaler.fit_transform(X) km = KMeans(n_clusters=3, random_state=1) km.fit(X_scaled) df_cluster_3no_syl['cluster'] = km.labels_ df_novel['cluster_3no_syl'] = df_cluster_3no_syl['cluster'] #Create 5 clusters df_cluster_5 = df_cluster.copy() X = df_cluster_5.drop(0, axis=1) X_scaled = scaler.fit_transform(X) km = KMeans(n_clusters=5, random_state=1) km.fit(X_scaled) df_cluster_5['cluster'] = km.labels_ df_novel['cluster_5'] = df_cluster_5['cluster'] df_novel.to_csv('novel_'+nn+'list_1.csv', index=False) except: rejects_3.append(novel_num)
def correlation_matching(I_tr, T_tr, I_te, T_te, n_comps):
    """
    Learns correlation matching (CM) over I_tr and T_tr and applies it to
    I_tr, T_tr, I_te, T_te.

    Parameters
    ----------
    I_tr: np.ndarray [shape=(n_tr, d_I)]
        image data matrix for training
    T_tr: np.ndarray [shape=(n_tr, d_T)]
        text data matrix for training
    I_te: np.ndarray [shape=(n_te, d_I)]
        image data matrix for testing
    T_te: np.ndarray [shape=(n_te, d_T)]
        text data matrix for testing
    n_comps: int > 0 [scalar]
        number of canonical components to use

    Returns
    -------
    I_tr_cca : np.ndarray [shape=(n_tr, n_comps)]
        image data matrix represented in correlation space
    T_tr_cca : np.ndarray [shape=(n_tr, n_comps)]
        text data matrix represented in correlation space
    I_te_cca : np.ndarray [shape=(n_te, n_comps)]
        image data matrix represented in correlation space
    T_te_cca : np.ndarray [shape=(n_te, n_comps)]
        text data matrix represented in correlation space
    """
    # scale image and text data
    I_scaler = StandardScaler()
    I_tr = I_scaler.fit_transform(I_tr)
    I_te = I_scaler.transform(I_te)

    T_scaler = StandardScaler()
    T_tr = T_scaler.fit_transform(T_tr)
    T_te = T_scaler.transform(T_te)

    cca = PLSCanonical(n_components=n_comps, scale=False)
    cca.fit(I_tr, T_tr)

    I_tr_cca, T_tr_cca = cca.transform(I_tr, T_tr)
    I_te_cca, T_te_cca = cca.transform(I_te, T_te)

    return I_tr_cca, T_tr_cca, I_te_cca, T_te_cca
def train_test(self, X, y, X_test): """ """ sss = StratifiedShuffleSplit(y, 1, test_size=0.5) for train_id, valid_id in sss: X0, X1 = X[train_id], X[valid_id] y0, y1 = y[train_id], y[valid_id] #First half w0 = np.zeros(len(y0)) for i in range(len(w0)): w0[i] = self.w[int(y0[i])] xg0_train = DMatrix(X0, label=y0, weight=w0) xg0_test = DMatrix(X1, label=y1) xgt_test = DMatrix(X_test) bst0 = my_train_xgboost(self.param, xg0_train, self.num_round) y0_pred = bst0.predict(xg0_test).reshape(X1.shape[0], 9) yt_pred = bst0.predict(xgt_test).reshape(X_test.shape[0], 9) #Calibrated RF rf = RandomForestClassifier(n_estimators=600, criterion='gini', class_weight='auto', max_features='auto') cal = CalibratedClassifierCV(rf, method='isotonic', cv=3) cal.fit(X0, y0) y0_cal = cal.predict_proba(X1) yt_cal = cal.predict_proba(X_test) #Second half ss = StandardScaler() y0_pred = ss.fit_transform(y0_pred) yt_pred = ss.fit_transform(yt_pred) y0_cal = ss.fit_transform(y0_cal) yt_cal = ss.fit_transform(yt_cal) X1 = np.hstack((X1, y0_pred, y0_cal)) X_test = np.hstack((X_test, yt_pred, yt_cal)) w1 = np.zeros(len(y1)) # self.param['eta'] = 0.01 self.num_round = 450 for i in range(len(w1)): w1[i] = self.w[int(y1[i])] xg1_train = DMatrix(X1, label=y1, weight=w1) xg_test= DMatrix(X_test) bst1 = my_train_xgboost(self.param, xg1_train, self.num_round) y_pred = bst1.predict(xg_test).reshape(X_test.shape[0], 9) return y_pred
def stack_features(params): """ Get local features for all training images together """ # Init detector and extractor detector, extractor = init_detect_extract(params) # Read image names with open( os.path.join(params["root"], params["root_save"], params["image_lists"], params["split"] + ".txt"), "r" ) as f: image_list = f.readlines() X = [] for image_name in image_list: # Read image im = cv2.imread( os.path.join(params["root"], params["database"], params["split"], "images", image_name.rstrip()) ) # Resize image im = resize_image(params, im) feats = image_local_features(im, detector, extractor) # Stack all local descriptors together if feats is not None: if len(X) == 0: X = feats else: X = np.vstack((X, feats)) if params["normalize_feats"]: X = normalize(X) if params["whiten"]: pca = PCA(whiten=True) pca.fit_transform(X) else: pca = None # Scale data to 0 mean and unit variance if params["scale"]: scaler = StandardScaler() scaler.fit_transform(X) else: scaler = None return X, pca, scaler
def main(): df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header = None, sep = '\s+') df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] print(df.head()) # Select a subset of the features and plot the correlation between features cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV'] sns.pairplot(df[cols], size=2.5); plt.title('Correlations between 5 features') plt.show() # Plot a heatmap of the same subset of features cm = np.corrcoef(df[cols].values.T) sns.set(font_scale=2.5) hm = sns.heatmap(cm, cbar = True, annot = True, square = True, fmt = '.2f', annot_kws = {'size': 15}, yticklabels = cols, xticklabels = cols) plt.show() X = df[['RM']].values y = df['MEDV'].values sc_x = StandardScaler() sc_y = StandardScaler() X_std = sc_x.fit_transform(X) y_std = sc_y.fit_transform(y) lr = LinearRegressionGD() lr.fit(X_std, y_std) plt.plot(range(1, lr.n_iter + 1), lr.cost_) plt.ylabel('SSE') plt.xlabel('Epoch') plt.show() lin_regplot(X_std, y_std, lr) plt.xlabel('Average number of rooms [RM] (standardized)') plt.ylabel('Price in $1000\'s [MEDV] (standardized)') plt.show() # Example classification for a house with 5 rooms num_rooms_std = sc_x.transform([5.0]) price_std = lr.predict(num_rooms_std) print("Price in $1000's: %.3f" % \ sc_y.inverse_transform(price_std))
def train_validate(self, X_train, y_train, X_valid, y_valid): """ """ sss = StratifiedShuffleSplit(y_train, 1, test_size=0.5) for train_id, valid_id in sss: X0_train, X1_train = X_train[train_id], X_train[valid_id] y0_train, y1_train = y_train[train_id], y_train[valid_id] #First half w0_train = np.zeros(len(y0_train)) for i in range(len(w0_train)): w0_train[i] = self.w[int(y0_train[i])] xg0_train = DMatrix(X0_train, label=y0_train, weight=w0_train) xg0_valid = DMatrix(X1_train, label=y1_train) xgv_valid = DMatrix(X_valid, label=y_valid) watchlist = [(xg0_train,'train'), (xg0_valid, 'validation0')] # bst0 = train(self.param, xg0_train, self.num_round, watchlist) bst0 = my_train_xgboost(self.param, xg0_train, self.num_round, watchlist) y0_pred = bst0.predict(xg0_valid).reshape(X1_train.shape[0], 9) yv_pred = bst0.predict(xgv_valid).reshape(X_valid.shape[0], 9) #Calibrated RF rf = RandomForestClassifier(n_estimators=600, criterion='gini', class_weight='auto', max_features='auto') cal = CalibratedClassifierCV(rf, method='isotonic', cv=3) cal.fit(X0_train, y0_train) y0_cal = cal.predict_proba(X1_train) yv_cal = cal.predict_proba(X_valid) #Second half ss = StandardScaler() y0_pred = ss.fit_transform(y0_pred) yv_pred = ss.fit_transform(yv_pred) y0_cal = ss.fit_transform(y0_cal) yv_cal = ss.fit_transform(yv_cal) X1_train = np.hstack((X1_train, y0_pred, y0_cal)) X_valid = np.hstack((X_valid, yv_pred, yv_cal)) w1_train = np.zeros(len(y1_train)) # self.param['eta'] = 0.05 self.num_round = 450 for i in range(len(w1_train)): w1_train[i] = self.w[int(y1_train[i])] xg1_train = DMatrix(X1_train, label=y1_train, weight=w1_train) xg_valid = DMatrix(X_valid, label=y_valid) watchlist = [(xg1_train,'train'), (xg_valid, 'validation')] # bst1 = train(self.param, xg1_train, self.num_round, watchlist) bst1 = my_train_xgboost(self.param, xg1_train, self.num_round, watchlist) y_pred = bst1.predict(xg_valid).reshape(X_valid.shape[0], 9) # pdb.set_trace() return y_pred
def perform_scaling(features, scaling='standard'):
    if scaling == 'standard':
        print("Performing standard scaling")
        scaler = StandardScaler()
    else:
        print("Performing min-max scaling")
        scaler = MinMaxScaler()
    # keep the transformed array; the original code discarded the fit_transform result
    # and returned the unscaled features
    features = scaler.fit_transform(features)
    print("Completed %s Scaler fit!" % (scaling))
    return features
def main(): if REDUCE_SIZE: TEST_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'test_RS.csv') TRAIN_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'train_RS.csv') else: TEST_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'test_FS.csv') TRAIN_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'train_FS.csv') # # Process Training Data # training_data = processs_image_data(TRAINING_FILE, reduce_size = REDUCE_SIZE, is_test_data = False) column_names = list(training_data.columns) # # Scale (z-score) features, save scale tranform and to use with test data # y = training_data['label'] X = training_data.drop('label', axis=1) scalar = StandardScaler().fit(X) X = scalar.fit_transform(X) scaled_data = np.column_stack((y, X)) scaled_training_data = pd.DataFrame(data=scaled_data, columns=column_names) scaled_training_data.to_csv(TRAIN_OUTPUT_DATA_FILE, index=False) print('Samples: %d, attributes: %d' %(scaled_training_data.shape[0], scaled_training_data.shape[1])) print('Training Data saved to %s' % (TRAIN_OUTPUT_DATA_FILE)) # # Process Test Data # test_data = processs_image_data(TEST_FILE, reduce_size = REDUCE_SIZE, is_test_data = True) column_names = list(test_data.columns) # # Apply scaling transform # scaled_data = scalar.fit_transform(test_data) scaled_test_data = pd.DataFrame(data=scaled_data, columns=column_names) scaled_test_data.to_csv(TEST_OUTPUT_DATA_FILE, index=False) print('Samples: %d, attributes: %d' %(scaled_test_data.shape[0], scaled_test_data.shape[1])) print('Test Data saved to %s' % (TEST_OUTPUT_DATA_FILE))
def fit_svm(train_y, train_x, test_x, c=None, gamma=None): """ Returns a DataFrame of svm results, containing prediction strain labels and printing the best model. The model's parameters will be tuned by cross validation, and accepts user-defined parameters. Parameters ---------- train_y: pandas.Series labels of classification results, which are predicted strains. train_x: pandas.DataFrame features used to predict strains in training set test_x: pandas.DataFrame features used to predict strains in testing set c: list, optional tuning parameter of svm, which is penalty parameter of the error term gamma: list, optional tuning parameter of svm, which is kernel coefficient Returns ---------- svm results: pandas.DataFrame Prediction strain labels """ # input validation if c is not None: if not isinstance(c, list): raise TypeError("c should be a list") if gamma is not None: if not isinstance(gamma, list): raise TypeError("gamma should be a list") # creat svm model scaler = StandardScaler() train_x = scaler.fit_transform(train_x) Cs = c Gammas = gamma if c is None: Cs = list(np.logspace(-6, -1, 10)) if gamma is None: Gammas = list(np.linspace(0.0001, 0.15, 10)) svc = svm.SVC() clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs, gamma=Gammas), n_jobs=-1) clf.fit(train_x, train_y) clf = clf.best_estimator_ # fit the best model clf.fit(train_x, train_y) # predict the testing data and convert to data frame prediction = clf.predict(scaler.fit_transform((test_x))) prediction = pd.DataFrame(prediction) prediction.columns = ['predict_strain'] print('The best SVM Model is:') print(clf) return prediction
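# Note (illustrative, not part of fit_svm): the function above refits the scaler on
# test_x when predicting (scaler.fit_transform((test_x))). A minimal hedged sketch of
# the more common pattern, with stand-in data, reuses the statistics learned on the
# training features instead of refitting.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
train_x_demo = rng.normal(10.0, 2.0, size=(100, 3))   # stand-in training features
test_x_demo = rng.normal(10.0, 2.0, size=(20, 3))     # stand-in test features

scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x_demo)   # learn mean/std on train only
test_x_scaled = scaler.transform(test_x_demo)         # apply the same statistics, no refit
print(scaler.mean_.round(2), scaler.scale_.round(2))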
def artificial_linear(ldsvm, X, y): params_dist_svm = { "C" : [1, 2**2, 5, 2**3, 2**4], "c" : [5, 10, 12, 16, 18], "max_iter" : [400], "step" : [10] } params_svm = { "C" : [0.1, 0.3, 0.5, 1, 2, 3, 6, 2**3], "max_iter" : [400], "penalty" : ['l1'], "dual" : [False] } local_risk, central_risk, ldsvm_risk = get_risks(ldsvm, params_svm, params_dist_svm, X, y) print(">-------------Best Risks from Grid Search---------------------<") print("Risk Local --> ", local_risk) print("Risk LDSVM --> ", ldsvm_risk) print("Risk Central --> ", central_risk) gs = GridSearchCV(LinearSVC(), params_svm) scaler = StandardScaler() ldsvm.network.split_data(X, y, stratified=False) local_data = str(datas_path) + "/data_0.csv" local_class = str(datas_path) + "/class_0.csv" X_local = pd.read_csv(local_data).values y_local = pd.read_csv(local_class).values.T[0] X_local_scale = scaler.fit_transform(X_local) X_scale = scaler.fit_transform(X) params_dist_best = ldsvm.grid_search(X, y, params_dist_svm, stratified=False) gs.fit(X_local, y_local) params_local_best = gs.best_params_ gs.fit(X, y) params_central_best = gs.best_params_ ldsvm.set_params(**params_dist_best) ldsvm.fit(X_scale, y, stratified=False) local_model = LinearSVC(**params_local_best).fit(X_local_scale, y_local) central_model = LinearSVC(**params_central_best).fit(X_scale, y) print(">-------------Best Parameters for Whole data Set--------------<") print("Parameters Local --> ", params_local_best) print("Parameters LDSVM --> ", params_dist_best) print("Parameters Central -->", params_central_best) analysis.plot_planes(X, y, local_model, central_model, ldsvm) analysis.plot_dispersion(ldsvm)
def make_scaler(subject):
    raw = []
    fnames = glob('../data/subj%d_series[1-7]_data.csv' % (subject))
    for fname in fnames:
        data, _ = prepare_data_train(fname)
        raw.append(data)
    X = pd.concat(raw)
    X = np.asarray(X.astype(float))
    # only the fitted scaler is needed, so fit() is sufficient here
    scaler = StandardScaler()
    scaler.fit(X)
    return scaler
def logistic_regression(x_train,y_train,x_test,penalty='L2', regularization=1.0, do_CV=False): from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import KFold ### Mean Normalize variables before regression ### from sklearn.preprocessing import StandardScaler ss=StandardScaler() x_train=ss.fit_transform(x_train) x_test=ss.fit_transform(x_test) lr=LogisticRegression() if penalty=='L1': lr = LogisticRegression(penalty='l1') filename="Lasso_submission.csv" else: lr = LogisticRegression(penalty='l2') filename="Ridge_submission.csv" if do_CV: Cs=np.logspace(-1.5, 1.5, 10) cv_list=list() ### Fit lasso to various choices of regularization parameter C to select optimal C for c in Cs: lr.C = c print 'Running K-fold CV with C = %.5f' % (1.0/c) cv_scores=udf.cross_val_score_proba(x_train,y_train,5,lr) cv_list.append(np.mean(cv_scores)) print 'Best lambda based on Cross-Validation...' max_score=np.max(cv_list) max_lambda=Cs[cv_list.index(max_score)] print 1.0/max_lambda, max_score else: print 'Making prediction with optimal lambda....' lr.C=1.0/regularization lr.fit(x_train,y_train) y_pred=lr.predict_proba(x_test)[:,1] print 'Coefficients of the regression:' print lr.coef_ print 'Writing submission file....' with open(filename,'wb') as testfile: w=csv.writer(testfile) w.writerow(('Id','Probability')) for i in range(len(y_pred)): w.writerow(((i+1),y_pred[i])) testfile.close() print 'File written to disk...'
def _create_features(self):
    standard_scaler = StandardScaler()
    # difference the series and throw away the first point
    train = self.train_features.diff().iloc[1:]
    test = self.test_features.diff().iloc[1:]
    scaled_train = pd.DataFrame(index=train.index,
                                data=standard_scaler.fit_transform(train.values))
    # reuse the scaler fit on the training data instead of refitting on the test data
    scaled_test = pd.DataFrame(index=test.index,
                               data=standard_scaler.transform(test.values))
    self.normalized_differenced_train_features = scaled_train
    self.normalized_differenced_test_features = scaled_test
    return
def main(): data = np.genfromtxt('housing.csv', delimiter=',') data = np.hstack((np.ones((data.shape[0], 1)), data)) # indexes = np.random.permutation(data.shape[0]) # data = data[indexes, :].astype(float) c = 400 train_x = data[:-1, :c].T train_y = data[-1, :c] sc_x = StandardScaler() sc_y = StandardScaler() X_std = sc_x.fit_transform(train_x) y_std = sc_y.fit_transform(train_y) m, n = train_x.shape train_y = train_y.reshape(m, 1) test_x = data[:-1, c + 1:].T test_y = data[-1, c + 1:] test_y = test_y.reshape(test_y.shape[0], 1) theta = np.random.random(n).reshape(n, 1) res = fmin_cg(linear_regression, theta, fprime=gradient, args=(X_std, y_std, m, n), maxiter=200, disp=True) Theta = res print('Theta: %s' % str(Theta)) actual_prices = y_std predicted_prices = X_std.dot(Theta.T).reshape(train_x.shape[0], 1) train_rms = math.sqrt(np.power(predicted_prices - actual_prices, 2).mean()) print('RMS training error: %f' % (train_rms)) test_x_std = sc_x.transform(test_x) test_y_std = sc_y.transform(test_y) actual_prices = test_y_std predicted_prices = test_x_std.dot(Theta.T).reshape(test_x_std.shape[0], 1) test_rms = math.sqrt(np.power(predicted_prices - actual_prices, 2).mean()) print('RMS testing error: %f' % (test_rms)) plot(actual_prices, predicted_prices)
def transformTestData(self, train_data, test_data): #Select the right features for both training and testing data X_train, y_train = self.__selectRelevantFeatures(train_data) X_test, y_test = self.__selectRelevantFeatures(test_data) #Transform categorical variables into integer labels martial_le = LabelEncoder() occupation_le = LabelEncoder() relationship_le = LabelEncoder() race_le = LabelEncoder() sex_le = LabelEncoder() transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le] for i in range(len(transformers)): X_train[:,i] = transformers[i].fit_transform(X_train[:,i]) X_test[:,i] = transformers[i].transform(X_test[:,i]) #Dummy code categorical variables dummy_code = OneHotEncoder(categorical_features = range(5)) X_train = dummy_code.fit_transform(X_train).toarray() X_test = dummy_code.transform(X_test).toarray() #Normalize all features scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) #Encode y class_le = LabelEncoder() y_train = class_le.fit_transform(y_train) y_test = class_le.transform(y_test) #print class_le.transform(["<=50K", ">50K"]) return X_train, X_test, y_train, y_test
def buildTreeRegressor(predictorColumns, structurestable = 'structures.csv', targetcolumn = 'c_a', md = None): """ Build a random forest-regressor model to predict some structure feature from compositional data. Will return the model trained on all data, a mean_absolute_error score, and a table of true vs. predicted values """ df = pd.read_csv(structurestable) df = df.dropna() if('fracNobleGas' in df.columns): df = df[df['fracNobleGas'] <= 0] s = StandardScaler() X = s.fit_transform(df[predictorColumns].astype('float64')) y = df[targetcolumn].values rfr = RandomForestRegressor(max_depth = md) acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error))) X_train, X_test, y_train, y_test = train_test_split(X,y) rfr.fit(X_train,y_train) y_predict = rfr.predict(X_test) t = pd.DataFrame({'True':y_test, 'Predicted':y_predict}) rfr.fit(X, y) return rfr, t, round(acc,2)
def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv', targetcolumn = 'pointGroup', md = None): """ Build a random forest-classifier model to predict some structure feature from compositional data. Will return the model trained on all data, a confusion matrix calculated , and an average accuracy score. Also returns a label encoder object """ df = pd.read_csv(structurestable) df = df.dropna() if('fracNobleGas' in df.columns): df = df[df['fracNobleGas'] <= 0] s = StandardScaler() le = LabelEncoder() X = s.fit_transform(df[predictorColumns].astype('float64')) y = le.fit_transform(df[targetcolumn].values) rfc = RandomForestClassifier(max_depth = md) acc = mean(cross_val_score(rfc, X, y)) X_train, X_test, y_train, y_test = train_test_split(X,y) rfc.fit(X_train,y_train) y_predict = rfc.predict(X_test) cm = confusion_matrix(y_test, y_predict) cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_) rfc.fit(X, y) return rfc, cm, round(acc,2), le
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir = 'coordination/', md = None): """ Build a coordination predictor for a given element from compositional structure data of structures containing that element. Will return a model trained on all data, a mean_absolute_error score, and a table of true vs. predicted values """ try: df = pd.read_csv(coordinationDir + element + '.csv') except Exception: print 'No data for ' + element return None, None, None df = df.dropna() if('fracNobleGas' in df.columns): df = df[df['fracNobleGas'] <= 0] if(len(df) < 4): print 'Not enough data for ' + element return None, None, None s = StandardScaler() X = s.fit_transform(df[predictorColumns].astype('float64')) y = df['avgCoordination'].values rfr = RandomForestRegressor(max_depth = md) acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error))) X_train, X_test, y_train, y_test = train_test_split(X,y) rfr.fit(X_train,y_train) y_predict = rfr.predict(X_test) t = pd.DataFrame({'True':y_test, 'Predicted':y_predict}) rfr.fit(X, y) return rfr, t, round(acc,2)
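# Note (illustrative, not part of the builders above): the three tree builders fit the
# StandardScaler on the full table before calling cross_val_score, so the held-out fold
# influences the scaling statistics. A hedged sketch, assuming a recent scikit-learn, of
# keeping the scaling inside each CV fold with a Pipeline; data and names are stand-ins.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = make_regression(n_samples=200, n_features=8, noise=0.5)

# The scaler is refit on the training portion of every fold, so the held-out
# fold never leaks into the scaling statistics.
pipe = make_pipeline(StandardScaler(), RandomForestRegressor(max_depth=5))
scores = cross_val_score(pipe, X_demo, y_demo, scoring=make_scorer(mean_absolute_error))
print(np.mean(scores).round(3))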
def train_and_test(train_books, test_books, train, scale=True): X_train, y_train, cands_train, features = get_pair_data(train_books, True) X_test, y_test, cands_test, features = get_pair_data(test_books) scaler = None if scale: scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) print sum(y_train)*0.1/len(y_train) print 'Start training' print X_train.shape clf = train(X_train, y_train) print 'Done training' y_train_pred = clf.predict(X_train) y_test_pred = clf.predict(X_test) ''' # print performance for training books print "--------------Traning data-------------" train_perf = evaluate_books(clf, train_books, scaler, evaluate_pair) # print performance for testing books print "\n" print "--------------Testing data-------------" test_perf = evaluate_books(clf, test_books, scaler, evaluate_pair) ''' print 'Train Non-unique Precision:', precision(y_train_pred, y_train), 'Non-unique Recall:', recall(y_train_pred, y_train) print 'Test Non-unique Precision:', precision(y_test_pred, y_test), 'Recall:', recall(y_test_pred, y_test) return clf, scaler, X_train, y_train, X_test, y_test
def main(): t0 = time.time() # start time # output files path TRAINX_OUTPUT = "../../New_Features/train_x_processed.csv" TEST_X_OUTPUT = "../../New_Features/test__x_processed.csv" # input files path TRAIN_FILE_X1 = "../../ML_final_project/sample_train_x.csv" TRAIN_FILE_X2 = "../../ML_final_project/log_train.csv" TEST__FILE_X1 = "../../ML_final_project/sample_test_x.csv" TEST__FILE_X2 = "../../ML_final_project/log_test.csv" # load files TRAIN_DATA_X1 = np.loadtxt(TRAIN_FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18))) TEST__DATA_X1 = np.loadtxt(TEST__FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18))) TRAIN_DATA_X2 = logFileTimeCount(np.loadtxt(TRAIN_FILE_X2, delimiter=',', skiprows=1, dtype=object)) TEST__DATA_X2 = logFileTimeCount(np.loadtxt(TEST__FILE_X2, delimiter=',', skiprows=1, dtype=object)) # combine files TRAIN_DATA_X0 = np.column_stack((TRAIN_DATA_X1, TRAIN_DATA_X2)) TEST__DATA_X0 = np.column_stack((TEST__DATA_X1, TEST__DATA_X2)) # data preprocessing scaler = StandardScaler() TRAIN_DATA_X = scaler.fit_transform(TRAIN_DATA_X0) TEST__DATA_X = scaler.transform(TEST__DATA_X0) # output processed files outputXFile(TRAINX_OUTPUT, TRAIN_DATA_X) outputXFile(TEST_X_OUTPUT, TEST__DATA_X) t1 = time.time() # end time print "...This task costs " + str(t1 - t0) + " second."
def scalar_transform(x):
    # print(x)
    # note: fitting on a single row means every feature has zero variance,
    # so this returns an all-zero vector
    scaler = StandardScaler()
    # scaler.fit(x)
    return scaler.fit_transform([x])[0]
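# Quick demonstration (illustrative only) of the single-sample behaviour noted above,
# plus one possible intended variant; the column interpretation is an assumption about
# the intent, not the author's stated design.
import numpy as np
from sklearn.preprocessing import StandardScaler

x = [3.0, 7.0, 11.0]
print(StandardScaler().fit_transform([x])[0])  # one row, zero variance -> [0. 0. 0.]

# If the goal is to standardize the values of x against each other, treat them as a column.
print(StandardScaler().fit_transform(np.array(x).reshape(-1, 1)).ravel())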
from sklearn.linear_model import Lasso from sklearn.linear_model import ElasticNet # eval from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error raw_boston = datasets.load_boston() X = raw_boston.data y = raw_boston.target X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state = 42) std_scale = StandardScaler() X_tn_std = std_scale.fit_transform(X_tn) X_te_std = std_scale.transform(X_te) clf_lr = LinearRegression() clf_lr.fit(X_tn_std, y_tn) print('clf_lr.coef_ : ', clf_lr.coef_) print('clf_lr.intercept_ : ', clf_lr.intercept_) clf_ridge = Ridge(alpha = 1) clf_ridge.fit(X_tn_std, y_tn) print('clf_ridge.coef_ : ', clf_ridge.coef_) print('clf_ridge.intercept_ : ', clf_ridge.intercept_) clf_lasso = Lasso(alpha = 0.01) clf_lasso.fit(X_tn_std, y_tn) print('clf_lasso.coef_ : ', clf_lasso.coef_)
y = dataset.iloc[:, 560].values
y = y - 1
y_train = keras.utils.to_categorical(y)
X = dataset.iloc[:, 0:559]
y = dataset.iloc[:, 560]

XT = dataset.iloc[:, 0:559].values
yT = dataset.iloc[:, 560].values
yT = yT - 1
y_test = keras.utils.to_categorical(yT)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(X)
# apply the training-set scaling to the test features rather than refitting
x_test = sc.transform(XT)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=559))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
'req_method', 'req_dir', 'req_http_header', 'status_code', 'bytes_trans' ]) #we only need the IP Address & Status Code dataset = dataset[['IP', 'status_code']] #modifying dataset by aggregating count of status code against IP Address dataset = dataset.groupby( ['IP', 'status_code']).status_code.agg('count').to_frame('Total').reset_index() #We are inserting the Index No as it needs it, otherwise it will give Shape of passed values is (13, 2), indices imply (13, 3) error dataset.insert(0, 'IndexNo', range(len(dataset))) #we are droping IP Column as instead of this we will take the Index No as reference of IP and scale it train_data = dataset.drop(['IP'], axis=1) sc = StandardScaler() scaled_data = sc.fit_transform(train_data) #We have used here 3 as a cluster because it's a good practice to give odd number due to the calculation of points which are crucial between two cluster #This solely depends and varry from data to data model = KMeans(n_clusters=3) pred = model.fit_predict(scaled_data) #here IP_Scaled is actually IndexNo because IP Address is treated as string pred_ds = pd.DataFrame( scaled_data, columns=['IP_Scaled', 'status_code_Scaled', 'Total_Scaled']) pred_ds['Cluster'] = pred ds = pd.concat([dataset, pred_ds], axis=1, sort=False) #Here we are creating graph of Request per IP vs Count Graph = pxp.scatter(ds, 'Total', 'IP', 'Cluster', hover_data=['status_code'],
acc_val.append(acc1/5) #print the best parameters loc=acc_val.index(max(acc_val)) print(max(acc_val)) print(parameters[loc]) #training all the training data and write the parameters of model to file smote=SMOTE(k_neighbors=5) X_train,y_train=smote.fit_resample(TCGA_data,TCGA_label) std=StandardScaler() X_train=std.fit_transform(X_train) #training model ga,ma_num,mi_num=parameters[loc][0],parameters[loc][1],parameters[loc][2] clf_xg=XGBClassifier(gamma=ga,max_depth=ma_num,min_child_weight=mi_num, learning_rate=0.4,booster='gbtree',n_jobs=-1) clf_xg.fit(X_train,y_train) import pickle with open("./model_file/TCGA_clf_xg_all.pickle",'wb') as f: pickle.dump(clf_xg,f) #stander the testing data TCGA_test=pd.read_csv(loc_path+"TCGA_dataset/gene_sel_data/test.csv",header=None)
#df_rune = pd.DataFrame(df_rune)#,index=df_rune[:,0]) print(df_rune) # separate training and test data (70, 30) #X, y = df_rune.iloc[:, 1:].values, df_rune.iloc[:, 0].values X, y = df_rune[:, 1:], df_rune[:, 0] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0) # standardize the features sc = StandardScaler() X_train_std = sc.fit_transform(X_train) X_test_std = sc.transform(X_test) cov_mat = np.cov(X_train_std.T) eigen_vals, eigen_vecs = np.linalg.eig(cov_mat) #print('\nEigenvalues {}'.format(eigen_vals)) # with cumsum we can calculate the cumulative sum of expained variances tot = sum(eigen_vals) var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)] cum_var_exp = np.cumsum(var_exp) # Make a list of (eigenvalue, eigenvector) tuples eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))] # eigen_pairs
forest_clf.fit(X_train, y_train) forest_clf.predict([some_digit]) # In[61]: forest_clf.predict_proba([some_digit]) # In[62]: cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy") # In[63]: from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train.astype(np.float64)) cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy") # In[64]: y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3) conf_mx = confusion_matrix(y_train, y_train_pred) conf_mx # In[65]: def plot_confusion_matrix(matrix): """If you prefer color and a colorbar""" fig = plt.figure(figsize=(8, 8)) ax = fig.add_subplot(111)
import numpy as np import matplotlib.pyplot as plt dataset = pd.read_csv("Social_Network_Ads.csv") features = dataset.iloc[:, [2, 3]].values labels = dataset.iloc[:, 4].values #SPLITTING from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = train_test_split( features, labels, test_size=0.25, random_state=0) #FEATURE SCALING from sklearn.preprocessing import StandardScaler sc = StandardScaler() features_train = sc.fit_transform(features_train) features_test = sc.transform(features_test) #FITTING LOGISTIC REGRESSION INTO TRAINING SET from sklearn.linear_model import LogisticRegression classifier = LogisticRegression(random_state=0) classifier.fit(features_train, labels_train) #PREDICTING THE RESULTS labels_pred = classifier.predict(features_test) #MAKING THE CONFUSION MATRIX from sklearn.metrics import confusion_matrix cm = confusion_matrix(labels_test, labels_pred) #VISUALISING TRAINING DATA SET
import matplotlib.pyplot as plt from sklearn.metrics import precision_recall_curve numeros = skdata.load_digits() target = numeros['target'] imagenes = numeros['images'] n_imagenes = len(target) data = imagenes.reshape((n_imagenes, -1)) x_train, x_test, y_train, y_test = train_test_split(data, target, train_size=0.5) # todo lo que es diferente de 1 queda marcado como 0 y_train[y_train!=1]=0 y_test[y_test!=1]=0 scaler = StandardScaler() x_train = scaler.fit_transform(x_train) x_test = scaler.transform(x_test) cov = np.cov(x_train.T) valores, vectores = np.linalg.eig(cov) # pueden ser complejos por baja precision numerica, asi que los paso a reales valores = np.real(valores) vectores = np.real(vectores) # reordeno de mayor a menor ii = np.argsort(-valores) valores = valores[ii] vectores = vectores[:,ii]
X = dataset.iloc[:, [2, 3]].values y = dataset.iloc[:, 4].values # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) # Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # Fitting classifier to the Training set from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2) classifier.fit(X_train, y_train) # Create your classifier here # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix
# Dataset for decision function visualization: we only keep the first two # features in X and sub-sample the dataset to keep only 2 classes and # make it a binary classification problem. X_2d = X[:, :2] X_2d = X_2d[y > 0] y_2d = y[y > 0] y_2d -= 1 # It is usually a good idea to scale the data for SVM training. # We are cheating a bit in this example in scaling all of the data, # instead of fitting the transformation on the training set and # just applying it on the test set. scaler = StandardScaler() X = scaler.fit_transform(X) X_2d = scaler.fit_transform(X_2d) # ############################################################################# # Train classifiers # # For an initial search, a logarithmic grid with basis # 10 is often helpful. Using a basis of 2, a finer # tuning can be achieved but at a much higher cost. C_range = np.logspace(-2, 10, 13) gamma_range = np.logspace(-9, 3, 13) param_grid = dict(gamma=gamma_range, C=C_range) cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv) grid.fit(X, y)
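# Note (illustrative, not part of the example above): the comment above admits the example
# "cheats" by scaling all of the data before the grid search. A hedged sketch, assuming a
# recent scikit-learn, of putting the scaler inside a Pipeline so each CV split refits it
# on its training portion only; the grid here is deliberately small for speed.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)

# Pipeline parameters are addressed with the 'svc__' prefix.
pipe = make_pipeline(StandardScaler(), SVC())
param_grid = dict(svc__C=[0.1, 1, 10, 100], svc__gamma=np.logspace(-4, 0, 5))
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv)
grid.fit(X_demo, y_demo)
print(grid.best_params_)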
def main(): if os.path.exists(__TRAINED_DATA_SET): df = pd.read_csv(__TRAINED_DATA_SET) else: df = train() X = df.iloc[:, 1:].values y = df.iloc[:, 0].values # Encoding the Dependent Variable labelencoder_y = LabelEncoder() y = labelencoder_y.fit_transform(y) # Splitting the dataset into the Training set and Test set x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) # Feature Scaling sc = StandardScaler() x_train = sc.fit_transform(x_train) x_test = sc.transform(x_test) lda = LDA(n_components=None) x_train = lda.fit_transform(x_train, y_train) x_test = lda.transform(x_test) explained_variance = lda.explained_variance_ratio_ # Fitting Logistic Regression to the Training set classifier = LogisticRegression(random_state=0) classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) # Fitting K-NN to the Training set classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2) classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) # Fitting SVM to the Training set classifier = SVC(kernel='linear', random_state=0) classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) # Fitting Kernel SVM to the Training set classifier = SVC(kernel='rbf', random_state=0) classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) # Fitting Naive Bayes to the Training set classifier = GaussianNB() classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) # Fitting Decision Tree Classification to the Training set classifier = DecisionTreeClassifier(criterion='entropy', random_state=0) classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) # Fitting Random Forest Classification to the Training set classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0) classifier.fit(x_train, y_train) # Predicting the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm)) parameters = [{ 'C': [1, 10, 100, 1000], 'kernel': ['linear'] }, { 'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] }] grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1) grid_search = grid_search.fit(x_train, y_train) best_accuracy = grid_search.best_score_ best_parameters = grid_search.best_params_ # Fitting Kernel SVM to the Training set classifier = SVC(kernel='rbf', random_state=0) classifier.fit(x_train, y_train) # Predicting 
the Test set results y_pred = classifier.predict(x_test) # Making the Confusion Matrix cm = confusion_matrix(y_test, y_pred) (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
plt.style.use('seaborn-deep')
import matplotlib.cm
cmap = matplotlib.cm.get_cmap('plasma')

# Reading in data
ds = pd.read_csv("Social_Network_Ads.csv")
X = ds.iloc[:, [2, 3]].values
y = ds.iloc[:, 4].values

# Splitting and scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# transform the test set with the scaler fit on the training set
X_test = sc_X.transform(X_test)

# Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel="rbf")
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Fitting logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
def transform(feature): df[feature] = le.fit_transform(df[feature]) #print(df[feature]) #print(le.classes_) cat_df = df.select_dtypes(include='object') for col in cat_df.columns: transform(col) #for col in df.columns: #print(col) scaler = StandardScaler() scaled_df = scaler.fit_transform(df.drop('Attrition', axis=1)) X = scaled_df Y = df['Attrition'].as_matrix() Y = to_categorical(Y) x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) #print('x_test=',x_test.shape) np.random.seed(31) rn.seed(31) tf.set_random_seed(31) model = Sequential() model.add(Dense(input_dim=13, units=50, activation='relu'))
time_diff = np.round(time_diff, 4) all_seq[u]['time_diff'] = time_diff.tolist() for c in df.columns[2:]: all_seq[u][c] = df[df['actor_id'] == u][c].values.tolist() ### Store to JSON with open('make_sequence__observ_{}__labeled_{}_{}_{}_{}.json'.format(observ_daterange, label_daterange, try_date, version, desc), 'w') as fp: json.dump(all_seq, fp) ### Restore `clust_index`, `time_diff` to CSV clust_collect = [] diff_collect = [] for u in tqdm(uid): for i in range(0, len(all_seq[u]['clust_index'])): clust_collect.append(all_seq[u]['clust_index'][i]) diff_collect.append(all_seq[u]['time_diff'][i]) df = pd.concat([df, pd.DataFrame({'clust_index': clust_collect, 'time_diff': diff_collect})], axis=1) ### time_diff scaling df['time_diff_scaled'] = sd.fit_transform(df['time_diff'].values.reshape(-1, 1)) for u in tqdm(uid): all_seq[u]['time_diff_scaled'] = df[df['actor_id'] == u]['time_diff_scaled'].values.tolist() with open('make_sequence__observ_{}__labeled_{}_{}_{}_{}.json'.format(observ_daterange, label_daterange, try_date, version, desc), 'w') as fp: json.dump(all_seq, fp) df.to_csv('featureGeneration__observ_{}__labeled_{}_{}_{}_{}.csv'.format(observ_daterange, label_daterange, try_date, version, desc), index=False)
from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler # Load data print('reading train data...') client_aggs = pd.read_csv('../input/groupby_client_aggs.csv') ids = client_aggs['ClientId'] client_aggs['TotalUnits'] = np.log1p(client_aggs['TotalUnits']) client_aggs[ 'CostPerUnit'] = client_aggs['TotalPesos'] / client_aggs['TotalUnits'] client_aggs.drop(['TotalPesos', 'ClientId'], axis=1, inplace=True) client_aggs.fillna(0, inplace=True) scaler = StandardScaler() client_aggs = scaler.fit_transform(client_aggs) print("KMeans...\n") clf1000 = KMeans(n_clusters=1000, n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=1, copy_x=True, n_jobs=-1) clf250 = KMeans(n_clusters=250, n_init=10, max_iter=300, tol=0.0001, verbose=0,
from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])], remainder="passthrough") X = np.array(ct.fit_transform(X)) # print(X); #Encoding Dependent Variable from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y = np.array(le.fit_transform(y)) # print(y); #Splitting Dataset into Training Set & Test Set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # print(X_train); # print(X_test); # print(y_train); # print(y_test); #Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train[:, 3:] = sc.fit_transform(X_train[:, 3:]) X_test[:, 3:] = sc.transform(X_test[:, 3:]) print(X_train) print(X_test)
colnames.append('effective') df.columns = colnames #dropping empty or with 'effective- NaN' column and duplicated rows: df = df.drop([21,22,28,30,50,68,105,106,110,122]) #dealing with Nan df.iloc[:,:-1] = df.iloc[:,:-1].apply(lambda x: pd.factorize(x)[0]) X=df[['start_treat','doxy','ilads','buhner','cowden','liposomal','other_herbs','vitaminD','supp','oil','sugar-free','gluten-free','dairy-free','bioresonance','antimicrobial','oxygen','cannabis_oil','binaural','tobacco','alcohol','coffee','marijuana','other_stim','num_antibiotics','method_antibiotics']].values y=df['effective'].values from sklearn.preprocessing import StandardScaler sc = StandardScaler() X = sc.fit_transform(X) import keras from keras.utils.np_utils import to_categorical y_binary = to_categorical(y) ''' model = DecisionTreeRegressor(max_depth=10) cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error') ''' model = RandomForestRegressor(max_depth=15, n_estimators=25, n_jobs=8) model.fit(X,y_binary)
df.head() dataset['Class']=dataset['Class'].replace(3,0) dataset['Class']=dataset['Class'].replace(1,3) dataset['Class']=dataset['Class'].replace(2,1) dataset['Class']=dataset['Class'].replace(3,2) target = dataset['Class'] df=df.iloc[0:177,[1,12]] sc=StandardScaler() df=sc.fit_transform(df) pca = PCA(n_components=2) pca_x=pca.fit_transform(df) pca_df = pd.DataFrame(data=pca_x,columns=['comp1','comp2']) KModel = KMeans(n_clusters=3,random_state=2) KModel.fit_predict(pca_df) KModel.labels_ colormap = np.array(['Red','Blue','Green']) z = plt.scatter(pca_df.comp1,pca_df.comp2,c = colormap[KModel.labels_])
"""Checking for Quasi-Constant Features""" occ = x.loc[x.promotion_last_5years == 0, 'promotion_last_5years'].count() number_of_occ_per = occ/x.shape[0] * 100 print(str(number_of_occ_per) + '%') occ = x.loc[x.Work_accident == 0, 'Work_accident'].count() number_of_occ_per = occ/x.shape[0] * 100 print(str(number_of_occ_per) + '%') """Standard Scaling all the features to come under a common range.""" from sklearn.preprocessing import StandardScaler sc_x = StandardScaler() x = sc_x.fit_transform(x) x y """Inference : <br> The Data is Imbalanced. So, we must use ensemble learning methods and cross validation to avoid overfitting. # 8. Splitting into Train and Test Sets """ y.shape from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)
from sklearn.preprocessing import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_train)
imputed_data = imr.transform(x_train.values)
print('transformed')
print(imputed_data[:200])

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_test)
imputed_data2 = imr.transform(x_test.values)
print('transformed')
print(imputed_data2[:200])

std = StandardScaler()
x_train_std = std.fit_transform(imputed_data)
# scale the test set with the statistics learned on the training set
x_test_std = std.transform(imputed_data2)
print(x_train_std)
print(imputed_data.shape)

# neural network model
model1 = Sequential()
model1.add(layers.Dense(50, input_dim=9, activation='relu'))
model1.add(layers.Dense(40, activation='relu'))
model1.add(layers.Dense(30, activation='relu'))
model1.add(layers.Dense(25, activation='relu'))
model1.add(layers.Dense(2, activation='sigmoid'))
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)  # not needed: class labels are not scaled for classification

# Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# predicting test set results
y_pred = classifier.predict(X_test)

# confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
DATA_DIR = "data" AIRQUALITY_FILE = os.path.join(DATA_DIR, "AirQualityUCI.csv") aqdf = pd.read_csv(AIRQUALITY_FILE, sep=";", decimal=",", header=0) # remove columsn. data, time and last two columns del aqdf["Date"] del aqdf["Time"] del aqdf["Unnamed: 15"] del aqdf["Unnamed: 16"] # fill NaNs with the mean value aqdf = aqdf.fillna(aqdf.mean()) Xorig = aqdf.as_matrix() # scale the data scaler = StandardScaler() Xscaled = scaler.fit_transform(Xorig) # store the meand and std to be used after for porediction Xmeans = scaler.mean_ Xstds = scaler.scale_ # the target variable is the fourthn columun y= Xscaled[:,3] # delete the target variable from the input (training data) X = np.delete(Xscaled, 3, axis=1) # split training data inot 70 training and 30 testing train_size = int(0.7*X.shape[0]) Xtrain, Xtest, ytrain, ytest = X[0:train_size], X[train_size:],y[0:train_size], y[train_size:] # define the network, a 2 layer dense netweork takes the 12 features and outputs ascaled prediction # the hidden layer has 8 neurons, initialization, loss function (mse) and optimizer (adam) readings = Input(shape=(12,)) x = Dense(8, activation="relu", kernel_initializer="glorot_uniform")(readings) benzene = Dense(1, kernel_initializer="glorot_uniform")(x)
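# Note (illustrative, not part of the air-quality script above): the script stores
# scaler.mean_ and scaler.scale_ "to be used after for prediction", with column 3 as the
# target. A hedged sketch, using stand-in data, of how those arrays map scaled model
# outputs back to original units.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
Xorig_demo = rng.normal(50.0, 5.0, size=(100, 12))       # stand-in for the sensor matrix

scaler = StandardScaler()
Xscaled_demo = scaler.fit_transform(Xorig_demo)
Xmeans, Xstds = scaler.mean_, scaler.scale_

y_scaled_pred = Xscaled_demo[:5, 3]                       # pretend these are model predictions
y_original = y_scaled_pred * Xstds[3] + Xmeans[3]         # undo the standardization
print(np.allclose(y_original, Xorig_demo[:5, 3]))         # -> True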
plt.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    """
    Guidance from 2020-4-17:
    Using the new on-site data with 46 indicators (per iron tap), inspect the patterns
    via the PCA scores.
    """
    FILE = '铁次结果_5h滞后处理v3.0_tc.xlsx'
    N_COMPONENTS = 21  # number of principal components; must be set beforehand (or None)

    # load the data
    input_df = pd.read_excel(FILE, index_col=0, sheet_name='46')

    # standardize
    scaler = StandardScaler()
    scaled_np = scaler.fit_transform(input_df)
    df_scaled = pd.DataFrame(scaled_np, index=input_df.index, columns=input_df.columns)

    # PCA
    n = N_COMPONENTS
    pca = PCA(n)
    pca.fit(scaled_np)
    pca.explained_variance_ratio_.cumsum()  # check where the cumulative ratio reaches 0.9 / 0.95
    df_pca = pca.transform(scaled_np)

    # score plot
    score = df_pca.dot(pca.explained_variance_.reshape(n, 1)) / pca.explained_variance_.sum()
    max_range = score.shape[0]
X = dataset.iloc[:, :-1].values y = dataset.iloc[:, -1].values # Encode categorical varaibles label_encoder_X = LabelEncoder() #X[:,1] = label_encoder_X.fit_transform(X[:,1]) label_encoder_y = LabelEncoder() #y = label_encoder_y.fit_transform(y) # Scale Data st_sc = StandardScaler() print(st_sc.fit(X)) X = st_sc.fit_transform(X) #print(y) # Split X and y into training and testing datasets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0) # Create Model model = Sequential() #model.add(Dense(20, input_dim = 10, activation = 'relu')) #model.add(Dense(80, activation = 'relu')) #model.add(Dense(130, activation = 'relu'))
    random_state=7, test_size=0.25)

# split into training / validation sets
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state=11)
print(x_train.shape, y_train.shape)  # inspect the sample shapes
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

# feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

############################# multi-input wide & deep model ###########################################
# build the model
# multiple inputs
input_wide = keras.layers.Input(shape=[5])
input_deep = keras.layers.Input(shape=[6])
hidden1 = keras.layers.Dense(30, activation='relu')(input_deep)
hidden2 = keras.layers.Dense(30, activation='relu')(hidden1)
concat = keras.layers.concatenate([input_wide, hidden2])  # concatenate the two branches
output = keras.layers.Dense(1)(concat)
model = keras.models.Model(inputs=[input_wide, input_deep], outputs=[output])  # functional API
# the input features need to be split into wide/deep parts before calling fit
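# Hypothetical split of the scaled features into the wide (first 5 columns)
# and deep (last 6 columns) inputs, matching the input shapes declared above,
# followed by compiling and fitting the model.
x_train_scaled_wide = x_train_scaler[:, :5]
x_train_scaled_deep = x_train_scaler[:, -6:]
x_valid_scaled_wide = x_valid_scaled[:, :5]
x_valid_scaled_deep = x_valid_scaled[:, -6:]

model.compile(loss='mse', optimizer='adam')
history = model.fit([x_train_scaled_wide, x_train_scaled_deep], y_train,
                    validation_data=([x_valid_scaled_wide, x_valid_scaled_deep], y_valid),
                    epochs=20)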
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# There is a weakness in this approach, because the transformed variables are nominal:
# we cannot say, for example, that one race is "greater" than another,
# which is why they are one-hot encoded below.
onehotencoder = OneHotEncoder(categorical_features=[1, 3, 5, 6, 7, 8, 9, 13])
previsores = onehotencoder.fit_transform(previsores).toarray()

labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

standardScaler = StandardScaler()
previsores = standardScaler.fit_transform(previsores)

########################### CREATING THE TEST SET ###############################
from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)

from sklearn.linear_model import LogisticRegression
classificador = LogisticRegression()
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
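# Note: OneHotEncoder(categorical_features=...) was removed in newer
# scikit-learn releases; a rough equivalent (a sketch, not the original code)
# would use ColumnTransformer on the same column indices:
#
#   from sklearn.compose import ColumnTransformer
#   onehot = ColumnTransformer(
#       [('onehot', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
#       remainder='passthrough')
#   previsores = onehot.fit_transform(previsores)
#
# Assumed continuation: evaluate the classifier with the metrics imported above.
matriz = confusion_matrix(classe_teste, previsoes)
precisao = accuracy_score(classe_teste, previsoes)
print(matriz)
print(precisao)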
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into the new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

data = genData(1, 50, 50, 1200, 1280, 1320, 0.004, .001)
dipData = data[0]
qFactorClassification = data[1]

x_train, x_test, y_train, y_test = train_test_split(dipData, qFactorClassification, test_size=0.01, random_state=0)

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

start = time.time()
qFactorPrediction = loaded_model.predict(x_test)
end = time.time()
print(str(end - start))

plt.plot(y_test, color='red', label='Theoretical Q-Factor', marker='.')
plt.plot(qFactorPrediction, color='blue', label='Predicted Q-Factor', marker='.')
plt.title('Model Prediction')
plt.legend()
plt.show()
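# Optional numeric check (an addition; assumes the predictions and labels are
# comparable Q-factor values, as the plot above implies):
mae = np.mean(np.abs(np.ravel(qFactorPrediction) - np.ravel(y_test)))
print('Mean absolute error:', mae)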
lin_reg2.fit(X_poly, Y)

# examine the p-values
print("Polynomial Reg OLS:")
model2 = sm.OLS(lin_reg2.predict(poly_reg.fit_transform(X)), X)
print(model2.fit().summary())

print("Polynomial R-square value:")
print(r2_score(Y, lin_reg2.predict(poly_reg.fit_transform(X))))

# SVR
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
sc2 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
y_olcekli = sc2.fit_transform(Y)

from sklearn.svm import SVR
svr_reg = SVR(kernel='rbf')
svr_reg.fit(x_olcekli, y_olcekli.ravel())  # ravel() avoids the column-vector target warning

print("SVR OLS:")
model3 = sm.OLS(svr_reg.predict(x_olcekli), x_olcekli)
print(model3.fit().summary())

print("SVR R-square value:")
print(r2_score(Y, svr_reg.predict(x_olcekli)))

# Decision Tree
from sklearn.tree import DecisionTreeRegressor
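# Hypothetical continuation following the same pattern as the SVR block above
# (not taken from the original file): fit the decision tree and report R².
dt_reg = DecisionTreeRegressor(random_state=0)
dt_reg.fit(X, Y)
print("Decision Tree R-square value:")
print(r2_score(Y, dt_reg.predict(X)))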
def main(model_name, params):
    ## Read data
    print("Reading features...")
    train_df = pd.read_csv(features_directory + model_name + "_features.csv", delimiter=',')
    ids = train_df.id.tolist()
    train_df = train_df.drop(["id"], axis=1)

    # Sanity check: the ids must run from 0 to N-1 in order
    if list(range(len(ids))) != ids:
        print("ERROR: Indices are not ordered!")
        sys.exit()

    ## Normalize features
    print("\nNormalization...")
    scaler = StandardScaler()
    train_array = scaler.fit_transform(train_df)

    ## Make train data
    print("\nMaking datasets...")
    triplet_file = "../datasets/train_triplets.txt"

    ## Make train and validation triplets, making sure there are no common images
    ## between the train and validation triplets
    triplets_train, triplets_validation, triplets_test = make_train_validation_test_triplets_list(triplet_file)

    data_train_1 = make_triplets(train_array, triplets_train)
    data_validation_1 = make_triplets(train_array, triplets_validation)
    data_test_1 = make_triplets(train_array, triplets_test)
    data_train_0 = make_0_triplets(data_train_1)
    data_validation_0 = make_0_triplets(data_validation_1)
    data_test_0 = make_0_triplets(data_test_1)

    data_train_1_in, data_train_1_out, data_train_0_out, data_train_0_in = train_test_split(
        data_train_1, data_train_0, train_size=0.5)
    data_validation_1_in, data_validation_1_out, data_validation_0_out, data_validation_0_in = train_test_split(
        data_validation_1, data_validation_0, train_size=0.5)
    data_test_1_in, data_test_1_out, data_test_0_out, data_test_0_in = train_test_split(
        data_test_1, data_test_0, train_size=0.5)

    n1 = len(data_train_1_in)
    n0 = len(data_train_0_in)
    X_train = np.concatenate((data_train_1_in, data_train_0_in), axis=0)
    y_train = np.array(n1 * [1.] + n0 * [0.])
    y_2D_train = np.array(list(map(list, zip(y_train, not_(y_train)))))

    n1 = len(data_validation_1_in)
    n0 = len(data_validation_0_in)
    X_validation = np.concatenate((data_validation_1_in, data_validation_0_in), axis=0)
    y_validation = np.array(n1 * [1.] + n0 * [0.])
    y_2D_validation = np.array(list(map(list, zip(y_validation, not_(y_validation)))))

    n1 = len(data_test_1_in)
    n0 = len(data_test_0_in)
    X_test = np.concatenate((data_test_1_in, data_test_0_in), axis=0)
    y_test = np.array(n1 * [1.] + n0 * [0.])
    y_2D_test = np.array(list(map(list, zip(y_test, not_(y_test)))))

    ## Shuffle the datasets
    X_train, y_2D_train = shuffle(X_train, y_2D_train)
    X_validation, y_2D_validation = shuffle(X_validation, y_2D_validation)

    ## Make model
    model = create_model(np.shape(X_train)[2], params["n_units"], params["dropout"])
    print("Model summary:")
    print(model.summary())
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['acc'],
    )

    print("\nFitting...")
    history = model.fit(
        (X_train[:, 0], X_train[:, 1], X_train[:, 2]),
        y_2D_train,
        validation_data=((X_validation[:, 0], X_validation[:, 1], X_validation[:, 2]), y_2D_validation),
        epochs=params["n_epochs"],
        batch_size=params["batch_size"])

    ## Prediction on the test dataset
    print("\nPredictions for the test sample...")
    y_pred_proba = model.predict((X_test[:, 0], X_test[:, 1], X_test[:, 2]))
    auc = roc_auc_score(y_2D_test[:, 0], y_pred_proba[:, 0])
    print("ROC AUC: %.2f" % auc)
    best_cut = 0.5
    y_pred = y_pred_proba[:, 0] >= best_cut
    print("Accuracy: %.3f" % (accuracy_score(y_2D_test[:, 0], y_pred)))

    ## Control plots
    # Loss and accuracy curves
    #for variable in ("loss", "acc"):
    #    plt.figure()
    #    plot_var(variable, history)
    #    plt.savefig(variable + ".pdf")
    #    plt.close()

    ## Load the test dataset and write the submission
    print("\nPredictions for the test dataset...")
    triplet_file = "../datasets/test_triplets.txt"
    X_test = make_triplets_from_file(train_array, triplet_file)
    y_pred_test_proba = model.predict((X_test[:, 0], X_test[:, 1], X_test[:, 2]))
    y_pred_test = y_pred_test_proba[:, 0] >= best_cut
    np.savetxt("submit.txt", y_pred_test, fmt="%d")
    return