def main():
    """Load every CSV under INPUT_PATH, concatenate, split, and save train/val/test.

    Reads all files in INPUT_PATH, concatenates them into one frame, carves
    off a test split, then splits the remainder into train/validation, and
    writes the three CSVs to the configured output paths.
    """
    # Load and concatenate.
    # Bug fix: the original passed bare filenames to read_csv, which only
    # works when the CWD happens to be INPUT_PATH — join the directory in.
    files = os.listdir(INPUT_PATH)
    df_multi = [pd.read_csv(os.path.join(INPUT_PATH, file)) for file in files]
    # ignore_index avoids duplicate row labels carried over from each file.
    df_full = pd.concat(df_multi, ignore_index=True)

    # Two-stage split: first carve off the test set, then split what is
    # left into train/validation, both seeded for reproducibility.
    df_inter, df_test = train_test_split(df_full, test_size=PERCENT, random_state=SEED)
    df_train, df_val = train_test_split(df_inter, test_size=PERCENT, random_state=SEED)

    # Persist each split.
    df_train.to_csv(OUTPUT_PATH_TRAIN)
    df_val.to_csv(OUTPUT_PATH_VAL)
    df_test.to_csv(OUTPUT_PATH_TEST)
def get_divided(df):
    """Split *df* into train/test arrays on the 'price actual' target column.

    Plots the training points, then returns the tuple
    (X_train, X_test, y_train, y_test) as numpy arrays.
    """
    # Bug fix: DataFrame.as_matrix() was removed from pandas (deprecated in
    # 0.23, gone in 1.0) — to_numpy() is the supported replacement.
    X = df.drop('price actual', axis=1).to_numpy()
    y = df['price actual'].to_numpy()
    X_train, X_test, y_train, y_test = skl.train_test_split(X, y, test_size=0.2)
    # NOTE(review): plotting a multi-feature X against y draws one series per
    # feature, and no regression line is actually fitted despite the title —
    # confirm this plot is intentional.
    plt.plot(X_train, y_train, 'ro', label='Original data')
    plt.title('Linear Regression Result')
    plt.legend()
    plt.show()
    return X_train, X_test, y_train, y_test
def split_data(self, data):
    """Split *data* into training and validation sets.

    If a pickled validation set exists at ../data/valid.p, load it and use
    it as the validation split; otherwise derive train/validation splits
    from *data* via train_test_split.
    """
    validation_file = Path("../data/valid.p")
    # Bug fix: pathlib.Path has is_file(), not isfile() (that is os.path).
    if validation_file.is_file():
        with open(validation_file, mode='rb') as f:
            validate = pickle.load(f)
        # Bug fix: astype() returns a NEW array — the original called it and
        # discarded the result, leaving the data un-converted.
        self.X_validate = validate['features'].astype(np.float32)
        self.y_validate = validate['labels'].astype(np.float32)
    else:
        # Bug fix: train_test_split returns (X_train, X_test, y_train, y_test);
        # the original unpacked it as (X_train, y_train, X_validate, y_validate),
        # silently assigning the validation features to y_train.
        self.X_train, self.X_validate, self.y_train, self.y_validate = train_test_split(
            data['features'], data['labels'], test_size=0.33, random_state=0)
# Load the cached feature/label arrays for this training file.
# Bug fix: use context managers so the file handles are closed — the
# original leaked them via bare open() inside pickle.load().
with open(trainFile + ".feature", "rb") as f:
    temp_x = pickle.load(f)
with open(trainFile + ".label", "rb") as f:
    temp_y = pickle.load(f)
x.append(temp_x)
y.append(temp_y)
x = np.array(x)
y = np.array(y)
print(x.shape)
# max_x = np.amax(x.any())
# x = x/max_x

# Bug fix: train_test_split lives in sklearn.model_selection — there is no
# sklearn.train_test_split attribute on the top-level package.
from sklearn.model_selection import train_test_split

# Hold out 15% as the test set, then 25% of the remainder for validation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=0)

# Tiny slices for quick smoke tests.
x_dummy = x_train[:5]
y_dummy = y_train[:5]


def get_siamese_model(input_shape):
    """ Model architecture """
# y_training dim 30 * 2 (the first is x and the second is y) y_list = [pd.DataFrame()] * 2308 for i in range(2308): y_list[i] = df_train_y.iloc[(i*30):(i*30+30),-2:] # preprocessing to remove samples del X_list[593] del y_list[593] del X_list[858] del y_list[858] del X_list[2166] del y_list[2166] # Cross Validation X_list_tr, X_list_te, y_list_tr, y_list_te = train_test_split(X_list, y_list, size=0.2) # 2308 total samples, 2305 good samples model_list = [ARIMA.ARIMA(learning_rate=1, iterations=1000, l1_penality=1, lag=6)] * 2308 t = time.time() num_test = 2305 # Fit the models for i in range(num_test): model_list[i].fit(X_list[i], y_list[i]) print(str(i) + " is okay.") print("Fitting took %d seconds" % (time.time()-t)) df_test_X = pd.read_csv('../data/test_transformed.csv') X_test_list = [pd.DataFrame()] * 20 for i in range(20):
# Logistic Regression # import numpy as np import pandas as pd import sklearn.model_selection as train_test_split X_train,X_test,y_train,y_test = train_test_split(test_size=0.3,random_state=0) from sklearn.preprocessing import StandardScaler SS=StandardScaler() X_train = SS.fit_transform(X_train) X_test = SS.fit_transform(X_test) from sklearn.linear_model import LogisticRegression clf = LogisticRegression(random_state=0) clf.fit(X_train,y_train) y_pred = clf.predict(X_test) from sklearn.metrics import confusion_matrix cm=confusion_matrix(y_test,y_pred) print(cm) ''' TP FP TP FN CorrectPrediction WrongPreduction WrongPreduction CorrectPrediction Check Accuracy - by using accuracy score sensetivity, specificity and Precision
# import data
import numpy as np
import pandas as pd

# UCI blood-transfusion dataset; header=None keeps the header row as data,
# which is why the slices below start at row 1.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data',
    header=None)
print(df.tail())

y = df.iloc[1:, 4]
df.drop(df.columns[[2]], axis=1, inplace=True)
X = df.iloc[1:, 0:3]

#-----------------------------------------------------------------------------
# data pre processing
# Bug fix: train_test_split is not importable from the top-level sklearn
# package — it lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply it to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Combined arrays (e.g. for decision-region plots over all points).
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

#-----------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression
import plot_decision_regions as pp
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas

# Bug fix: train_test_split must be imported from sklearn.model_selection;
# `from sklearn import train_test_split` raises ImportError.
from sklearn.model_selection import train_test_split

#from medium article https://towardsdatascience.com/how-to-build-a-simple-song-recommender-296fcbc8c85
#these lines of code did not work on IDE
#triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
#songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'
#song_df_1 = pandas.read_table(triplets_file,header=None)
#song_df_1.columns = ['user_id', 'song_id', 'listen_count']
#song_df_2 = pandas.read_csv(songs_metadata_file)
#song_df = pandas.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")
#print('Data imported')

# Load the pre-merged song dataframe from disk instead.
song_df = pandas.read_csv('song_df.csv')
print(song_df.head())

# Report how many distinct users and songs the dataset covers.
users = song_df['user_id'].unique()
print(len(users))
songs = song_df['song_id'].unique()
print(len(songs))

# Hold out 20% of rows for evaluation, seeded for reproducibility.
train_data, test_data = train_test_split(song_df, test_size=0.20, random_state=0)
Created on Tue Jul 17 16:12:01 2018 @author: xzc """ # read data from csv file and train_test_split import pandas as pd from sklearn import train_test_split housing = pd.read_csv('cal_housing_clean.csv') housing.head() y_val = housing['medianHouseValue'] x_data = housing.drop('medianHouseValue', axis=1) X_train, X_test, y_train, y_test = train_test_split(x_data, y_val, test_size=0.3, random_state=101) from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() scaler.fit(X_train) X_train = pd.DataFrame(data=scaler.transform(X_train), columns=X_train.columns, index=X_train.index) X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns, index=X_test.index) scaler.transform(X_train) # create Feature columns housing.columns
generated_filepath = '../Generated/'
n_signs = 29

# One-hot template row: all zeros; each sign's own slot is set to 1 below.
# (np.int is removed in modern NumPy — use the builtin int dtype.)
y_base = np.zeros((n_signs,), dtype=int)

# Bug fix: the original was not valid Python (range(0:n_signs), missing
# colons, Matlab-style y_sign(i) = 1 indexing, row_count++). Rebuilt as
# list accumulation: one flattened image row in x, one one-hot row in y.
x_rows = []
y_rows = []
for sign_count in range(n_signs):
    inputdir = generated_filepath + str(sign_count) + '/'
    # Bug fix: copy the template — plain assignment would alias y_base and
    # accumulate 1s across iterations.
    y_sign = y_base.copy()
    y_sign[sign_count] = 1
    for image in os.listdir(inputdir):
        # Bug fix: os.listdir yields bare names; join with the directory so
        # imread finds the file regardless of the CWD.
        img_matrix = imread(os.path.join(inputdir, image))
        x_rows.append(img_matrix.ravel())
        y_rows.append(y_sign)
x_matrix = np.array(x_rows)
y_matrix = np.array(y_rows)

# Train/test split, then fit and score an SVM classifier.
X_train, X_test, Y_train, Y_test = train_test_split(x_matrix, y_matrix, test_size=0.33)
clf = svm.SVC()
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test, Y_test)