def predictGaitCycle(df):
    # Separate the six gait-speed targets from the shared features.
    X = df.drop(columns=[
        'Young_Slow', 'Young_Medium', 'Young_Fast',
        'Adult_Slow', 'Adult_Medium', 'Adult_Fast'
    ])
    YS = df['Young_Slow']
    YM = df['Young_Medium']
    YF = df['Young_Fast']
    AS = df['Adult_Slow']
    AM = df['Adult_Medium']
    AF = df['Adult_Fast']

    # Feature sets augmented with the other speeds of the same age group.
    XYS = pd.concat([X, YM, YF], axis=1)
    XYM = pd.concat([X, YS, YF], axis=1)
    XYF = pd.concat([X, YS, YM], axis=1)
    XAS = pd.concat([X, AM, AF], axis=1)
    XAM = pd.concat([X, AS, AF], axis=1)
    XAF = pd.concat([X, AS, AM], axis=1)

    degree = 2

    # Young slow regression by polynomial method
    X_train, X_test, Y_train, Y_test = ttsplit(X, YS, test_size=0.3)
    poly = PolynomialFeatures(degree)
    printGraph(X_train, X_test, Y_train, Y_test, poly, X)  # plotting helper defined elsewhere
    x_poly_train = poly.fit_transform(X_train)
    # Use transform (not fit_transform) on the test set so it is expanded
    # with the feature mapping learned from the training set.
    x_poly_test = poly.transform(X_test)
    reg = lr()
    reg.fit(x_poly_train, Y_train)
    y_pred = reg.predict(x_poly_test)
    mae = mean_absolute_error(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)
    print(mae, mse, rmse, r2, sep=' ')

    # Young slow walk predicted from medium and fast walk, by polynomial method
    X_train, X_test, Y_train, Y_test = ttsplit(XYS, YS, test_size=0.3)
    poly = PolynomialFeatures(degree)
    x_poly_train = poly.fit_transform(X_train)
    x_poly_test = poly.transform(X_test)
    reg = lr()
    reg.fit(x_poly_train, Y_train)
    y_pred = reg.predict(x_poly_test)
    mae = mean_absolute_error(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)
    print(mae, mse, rmse, r2, sep=' ')
    return True
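# The two evaluation blocks in predictGaitCycle are identical apart from the
# feature matrix, so the pattern factors cleanly into a helper; a minimal
# sketch with the needed sklearn imports (fit_poly_and_report is a
# hypothetical name, not part of the original code).
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split as ttsplit
from sklearn.preprocessing import PolynomialFeatures

def fit_poly_and_report(features, target, degree=2, test_size=0.3):
    """Fit a polynomial regression of the given degree and print MAE/MSE/RMSE/R2."""
    X_train, X_test, y_train, y_test = ttsplit(features, target, test_size=test_size)
    poly = PolynomialFeatures(degree)
    x_poly_train = poly.fit_transform(X_train)
    x_poly_test = poly.transform(X_test)  # reuse the training feature mapping
    reg = LinearRegression().fit(x_poly_train, y_train)
    y_pred = reg.predict(x_poly_test)
    mse = mean_squared_error(y_test, y_pred)
    print(mean_absolute_error(y_test, y_pred), mse, np.sqrt(mse),
          r2_score(y_test, y_pred), sep=' ')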
def train_incremental(X, y):
    # Split data into training and testing sets,
    # then split the training set in half:
    # the first part is used as the original data,
    # the second part is used as the incremental data.
    X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
    X_train_origin, X_train_incremental, y_train_origin, y_train_incremental = ttsplit(
        X_train, y_train, test_size=0.5, random_state=0)

    xg_train_origin = xgb.DMatrix(X_train_origin, label=y_train_origin)
    xg_train_incremental = xgb.DMatrix(X_train_incremental, label=y_train_incremental)
    xg_test = xgb.DMatrix(X_test, label=y_test)

    # ================= xgboost classification model ====================#
    # `category` is expected to be defined at module level (the list of class labels).
    params = {'objective': 'multi:softmax', 'num_class': len(category)}
    params['silent'] = 1  # note: replaced by 'verbosity' in newer xgboost releases
    num_round = 30
    model_origin = xgb.train(params, xg_train_origin, num_round)
    model_origin.save_model('xgb_model.model')

    # ================= train two versions of the model =====================#
    model_none_incremental = xgb.train(params, xg_train_incremental, num_round)
    model_incremental = xgb.train(params, xg_train_incremental, num_round,
                                  xgb_model='xgb_model.model')

    # benchmark
    pred_origin = model_origin.predict(xg_test)
    score = metrics.accuracy_score(y_test, pred_origin)
    f1 = metrics.f1_score(y_test, pred_origin, average='weighted')
    print('original model accuracy of %0.3f, and f1 score of %0.3f' % (score, f1))

    # "before": trained from scratch on the incremental half only
    pred_none_incremental = model_none_incremental.predict(xg_test)
    score = metrics.accuracy_score(y_test, pred_none_incremental)
    f1 = metrics.f1_score(y_test, pred_none_incremental, average='weighted')
    print('non-incremental model accuracy of %0.3f, and f1 score of %0.3f' % (score, f1))

    # "after": continued training from the saved original model
    pred_incremental = model_incremental.predict(xg_test)
    score = metrics.accuracy_score(y_test, pred_incremental)
    f1 = metrics.f1_score(y_test, pred_incremental, average='weighted')
    print('incremental model accuracy of %0.3f, and f1 score of %0.3f' % (score, f1))
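# A minimal way to exercise train_incremental on synthetic data, assuming the
# module-level names the function relies on (xgb, metrics, ttsplit, category);
# the dataset below is invented for illustration.
import xgboost as xgb
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split as ttsplit

category = [0, 1, 2]  # class labels; the function only reads len(category)
X, y = make_classification(n_samples=2000, n_features=20, n_informative=10,
                           n_classes=len(category), random_state=0)
train_incremental(X, y)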
def get_best_booster(target_variable, max_interactions, df, astro_columns):
    booster = None
    best_score = 1
    best_booster = None
    for current_run in range(max_interactions):
        X = df[astro_columns].values
        Y = df[target_variable].values
        total_test = xgb.DMatrix(X, feature_names=astro_columns)
        X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(
            X, Y, test_size=0.3, random_state=None, shuffle=True)
        # ETA, DEPTH, NUM_TREES and create_booster_swing_trade are module-level
        # names; the previous booster is passed back in for continued training.
        booster = create_booster_swing_trade(ETA, DEPTH, NUM_TREES,
                                             X_train_1, y_train_1,
                                             X_train_2, y_train_2,
                                             astro_columns, booster)
        # Score the booster on the full dataset; keep the best one seen so far.
        current_score = mse(booster.predict(total_test), Y)
        if current_score < best_score:
            best_score = current_score
            best_booster = booster
        gc.collect()
        print("{} - {} of {}, {}".format(target_variable, current_run,
                                         max_interactions, best_score))
        if best_score < MIN_PRECISION:
            break
    return best_booster, best_score
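# create_booster_swing_trade is defined elsewhere; since it receives the
# previous booster back on each run, it plausibly continues training through
# xgb.train's xgb_model argument. A hypothetical reconstruction of that
# pattern only (parameter meanings and the objective are assumptions):
import xgboost as xgb

def create_booster_swing_trade(eta, depth, num_trees,
                               X_train, y_train, X_valid, y_valid,
                               feature_names, prev_booster=None):
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, feature_names=feature_names)
    params = {'eta': eta, 'max_depth': depth, 'objective': 'reg:squarederror'}
    # Warm-start from the previous booster when one is passed in
    return xgb.train(params, dtrain, num_boost_round=num_trees,
                     evals=[(dvalid, 'valid')], verbose_eval=False,
                     xgb_model=prev_booster)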
def train_test_split(dataframe):
    # Note: this shadows sklearn's train_test_split; ttsplit is the sklearn import.
    class_new = dataframe['class']
    dataframe = dataframe.drop(columns=["class"])
    X_train, X_test, y_train, y_test = ttsplit(dataframe, class_new,
                                               test_size=0.3, random_state=42)
    Data = [X_train, X_test, y_train, y_test]
    return Data
def train_test_split(dataframe):
    class_new = dataframe['Class']
    # Drop the label column from the features so it does not leak into X
    # (mirrors the 'class' variant above).
    dataframe = dataframe.drop(columns=["Class"])
    X_train, X_test, y_train, y_test = ttsplit(dataframe, class_new,
                                               test_size=0.3, random_state=42)
    Data = [X_train, X_test, y_train, y_test]
    return Data
def train(model_name, optimizer_name, scheduler_name, lr,
          img_path, mask_path, names_path, epochs=10):
    model = models.getModel(model_name)
    model.build((None, None, None, 3))
    model.summary()

    scheduler = schedulers.getScheduler(scheduler_name, lr)
    optimizer = optimizers.getOptimizer(optimizer_name, scheduler)
    cce = tf.keras.losses.CategoricalCrossentropy()

    train_loss_metric = tf.keras.metrics.Mean()
    train_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()
    test_loss_metric = tf.keras.metrics.Mean()
    test_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()

    with open(names_path, 'r') as file_list:
        names = file_list.read().splitlines()

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    test_log_dir = 'logs/gradient_tape/' + current_time + '/test'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    trainset, testval = ttsplit(names, train_size=0.9)
    test, val = ttsplit(testval, train_size=0.5)
    # Note: this line overrides the split above, so training runs on every
    # name, including those in the validation and test lists.
    trainset = names

    total_step = 0
    with tf.device('/device:GPU:0'):
        for epoch in range(epochs):
            for step_, batch in enumerate(trainset):
                total_step += 1
                img, mask = utils.genData(batch, mask_path, img_path)
                with tf.GradientTape() as tape:
                    mask_pred = model(img)
                    loss = cce(mask, mask_pred)
                train_loss_metric.update_state(loss)
                train_accuracy_metric.update_state(mask, mask_pred)
                grads = tape.gradient(loss, model.trainable_weights)
                optimizer.apply_gradients(zip(grads, model.trainable_weights))

                # Every 150 steps, log training metrics, run validation,
                # log validation metrics, and reset all metric states.
                if step_ % 150 == 0:
                    with train_summary_writer.as_default():
                        tf.summary.scalar('Training Loss',
                                          train_loss_metric.result(), step=total_step)
                        tf.summary.scalar('Training Accuracy',
                                          train_accuracy_metric.result(), step=total_step)
                    for step, batch in enumerate(val):
                        img_val, mask_val = utils.genData(batch, mask_path, img_path)
                        mask_pred_val = model(img_val)
                        loss_val = cce(mask_val, mask_pred_val)
                        test_loss_metric.update_state(loss_val)
                        test_accuracy_metric.update_state(mask_val, mask_pred_val)
                    with test_summary_writer.as_default():
                        tf.summary.scalar('Validation Loss',
                                          test_loss_metric.result(), step=total_step)
                        tf.summary.scalar('Validation Accuracy',
                                          test_accuracy_metric.result(), step=total_step)
                    print('Epoch: ' + str(epoch) + ' | Batch: ' + str(step)
                          + ' | Training Loss: '
                          + str(train_loss_metric.result().numpy())
                          + ' | Training Accuracy: '
                          + str(train_accuracy_metric.result().numpy()))
                    print('Epoch: ' + str(epoch) + ' | Batch: ' + str(step)
                          + ' | Validation Loss: '
                          + str(test_loss_metric.result().numpy())
                          + ' | Validation Accuracy: '
                          + str(test_accuracy_metric.result().numpy()))
                    train_loss_metric.reset_states()
                    train_accuracy_metric.reset_states()
                    test_loss_metric.reset_states()
                    test_accuracy_metric.reset_states()
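# The eager per-sample loop above can usually be sped up by compiling the
# gradient step with tf.function; a minimal sketch, assuming the same model,
# cce, and optimizer objects as in train() above.
@tf.function
def train_step(img, mask):
    # One compiled optimization step: forward pass, loss, gradients, update.
    with tf.GradientTape() as tape:
        mask_pred = model(img, training=True)
        loss = cce(mask, mask_pred)
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss, mask_pred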
def train_test_split(X, Y, test_fraction, random_seed=None):
    return ttsplit(X, Y, test_size=test_fraction, random_state=random_seed)
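# A quick usage sketch for the thin wrapper above; the arrays are invented.
import numpy as np
from sklearn.model_selection import train_test_split as ttsplit

X = np.arange(20).reshape(10, 2)
Y = np.arange(10)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_fraction=0.3,
                                                    random_seed=42)
print(X_train.shape, X_test.shape)  # (7, 2) (3, 2)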
# Converting the data into vectorized format
vect = CountVectorizer(stop_words="english", max_features=10000).fit(final_sent)
print(len(vect.get_feature_names()))
train_vectorized = vect.transform(final_sent)
print("Stage 2 complete")

# Oversampling the data to solve the issue of class imbalance
# (newer imbalanced-learn releases use sampling_strategy / fit_resample
# in place of the deprecated ratio / fit_sample)
sampler = RandomOverSampler(sampling_strategy={1: 661902, 0: 661902}, random_state=0)
X_rs, y_rs = sampler.fit_resample(train_vectorized, youtube_train['tag'])
print("Stage Oversampler")

# Splitting the data for training and testing
x_train, x_test, y_train, y_test = ttsplit(X_rs, y_rs, test_size=0.25)
print("Stage 3 complete")

# Initializing the Random Forest classifier
randomfor = RandomForestClassifier()
randomfor.fit(x_train, y_train)
prediction = randomfor.predict(x_test)

# Printing the accuracy score
print(accuracy_score(y_test, prediction))
print("Stage 5 complete")

# Printing the overall metrics
from sklearn import metrics
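# To see what the oversampling step does in isolation, a minimal sketch on
# toy data (labels invented).
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

X_toy = [[0], [1], [2], [3], [4], [5]]
y_toy = [0, 0, 0, 0, 1, 1]          # imbalanced: four 0s, two 1s
X_bal, y_bal = RandomOverSampler(random_state=0).fit_resample(X_toy, y_toy)
print(Counter(y_bal))               # Counter({0: 4, 1: 4})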
# Main data pipeline
data = pd.read_csv('mle_fraud_test.csv', sep=';', index_col=0)

# For time's sake, we limit the dataset to 500 data points, with about 20
# confirmed fraud cases and about 30 blocked cases for diversity.
data = data.iloc[5500:6000]
X = data[[c for c in data.columns if c != 'transaction_status']]
Y = data['transaction_status']

# Split train/test data with ratio 80/20.
XTrain, XTest, YTrain, YTest = ttsplit(X, Y, test_size=0.2)

importTrainData = fraudDetectionData()
importTestData = fraudDetectionData()
importTrainData.importData(XTrain, YTrain)
importTestData.importData(XTest, YTest)
XTrainNorm = importTrainData.normalizeData()
XTestNorm = importTestData.normalizeData()

# Longest part of the pipeline; should be optimized in the future.
importTrainData.buildPseudoClasses()
YTrainFull = importTrainData.getLabels()
# (Fragment: the next three statements run inside the word- and
# sentence-level preprocessing loops, whose headers are not shown.)
# Stemming the word, i.e. transforming it to its root form,
# and adding it to the temp_sent variable
temp_sent = temp_sent + " " + stemming.stem(word)
# Appending the sentence to final_sent to collect the stemmed data
final_sent.append(temp_sent)
# Incrementing the counter
a = a + 1

# Converting the data into vectorized format
vect = CountVectorizer(stop_words="english", max_features=10000).fit(final_sent)
print(len(vect.get_feature_names()))
train_vectorized = vect.transform(final_sent)
print("Stage 2 complete")

# Splitting the data for training and testing
x_train, x_test, y_train, y_test = ttsplit(train_vectorized,
                                           youtube_train['tag'], test_size=0.25)
print("Stage 3 complete")

# Initializing a linear-kernel Support Vector Machine
mysvm = SVC(kernel='linear')
# Training on the training split
mysvm.fit(x_train, y_train)
# Predicting on the held-out test split
prediction = mysvm.predict(x_test)
print("Stage 4 complete")

# Printing the accuracy score
print(accuracy_score(y_test, prediction))
print("Stage 5 complete")

# Printing the overall metrics
from sklearn import metrics
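# For context, a self-contained sketch of the preprocessing loop the fragment
# above comes from, assuming NLTK's SnowballStemmer and an invented sentence
# list; the variable names mirror the fragment, but the loop headers are
# reconstructed, not original.
from nltk.stem.snowball import SnowballStemmer

stemming = SnowballStemmer("english")
sentences = ["cats are running", "dogs barked loudly"]  # invented sample data

final_sent = []
a = 0
for sentence in sentences:
    temp_sent = ""
    for word in sentence.split():
        # Stem each word and accumulate it into the rebuilt sentence
        temp_sent = temp_sent + " " + stemming.stem(word)
    final_sent.append(temp_sent)
    a = a + 1

print(final_sent)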
def split_test(X, y):
    X, Xt, y, yt = ttsplit(X, y, test_size=0.33, random_state=42)

    # Slice the training data into four folds (using each fold's test indices).
    X_ = []
    y_ = []
    for _, y_index in KFold(n_splits=4).split(X):
        X_.append(X[y_index])
        y_.append(y[y_index])

    # Define models
    model = MultiOutputRegressor(XGBRegressor())
    model2 = MultiOutputRegressor(XGBRegressor())

    # Caution: MultiOutputRegressor.partial_fit requires the wrapped estimator
    # to implement partial_fit, which XGBRegressor does not, so the
    # partial_fit calls below will fail unless a wrapper provides it.

    print("Test 1")
    model.fit(X, y)
    print("\tFit all X and y:", mean_squared_error(yt, model.predict(Xt)))

    model2.fit(X_[0], y_[0])
    model3 = deepcopy(model2)
    print("\tFit only X_[0] and y_[0]:", mean_squared_error(yt, model2.predict(Xt)))

    model3.partial_fit(X_[1], y_[1])
    model4 = deepcopy(model3)
    print("\tFit partial X_[1] and y_[1]:", mean_squared_error(yt, model3.predict(Xt)))

    model4.partial_fit(X_[2], y_[2])
    model5 = deepcopy(model4)
    print("\tFit partial X_[2] and y_[2]:", mean_squared_error(yt, model4.predict(Xt)))

    model5.partial_fit(X_[3], y_[3])
    print("\tFit partial X_[3] and y_[3]:", mean_squared_error(yt, model5.predict(Xt)))

    print("Test 2")
    # Define models
    model = MultiOutputRegressor(XGBRegressor())
    model2 = MultiOutputRegressor(XGBRegressor())

    # Fit on all the data
    model.fit(X, y)
    print("\tFit all X and y:", mean_squared_error(yt, model.predict(Xt)))

    # flatten (defined elsewhere) presumably concatenates a list of fold arrays.
    model2.fit(flatten(X_[0:1]), flatten(y_[0:1]))
    model3 = deepcopy(model2)
    print("\tFit only X_[0:1] and y_[0:1]:", mean_squared_error(yt, model2.predict(Xt)))

    model3.partial_fit(flatten(X_[0:2]), flatten(y_[0:2]))
    model4 = deepcopy(model3)
    print("\tFit partial X_[0:2] and y_[0:2]:", mean_squared_error(yt, model3.predict(Xt)))

    model4.partial_fit(flatten(X_[1:3]), flatten(y_[1:3]))
    model5 = deepcopy(model4)
    print("\tFit partial X_[1:3] and y_[1:3]:", mean_squared_error(yt, model4.predict(Xt)))

    model5.partial_fit(flatten(X_[2:4]), flatten(y_[2:4]))
    print("\tFit partial X_[2:4] and y_[2:4]:", mean_squared_error(yt, model5.predict(Xt)))
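# XGBRegressor has no partial_fit, so the incremental calls above only work
# with an estimator that implements it. A minimal runnable variant of the
# same experiment using sklearn's SGDRegressor on invented data:
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(400, 5))
y_demo = X_demo @ rng.normal(size=(5, 2))        # two regression targets

model = MultiOutputRegressor(SGDRegressor(max_iter=1000))
model.fit(X_demo[:200], y_demo[:200])            # initial fit on the first chunk
model.partial_fit(X_demo[200:], y_demo[200:])    # incremental update on the rest
print(mean_squared_error(y_demo, model.predict(X_demo)))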
def hypertune_parameters():
    # (Function header inferred from the call at the bottom of this snippet.)
    space = {
        'max_depth': hp.quniform("max_depth", 3, 20, 1),
        'gamma': hp.uniform('gamma', 1, 9),
        'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda': hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform("n_estimators", 100, 200, 5),
        'seed': 0
    }
    trials = Trials()
    best_hyperparams = fmin(fn=objective,
                            space=space,
                            algo=tpe.suggest,
                            max_evals=500,
                            trials=trials)
    print(best_hyperparams)


X_data, y_data = get_dataset("../agent/configs_xqn/buffer.csv")
X, Xt, y, yt = ttsplit(X_data, y_data, test_size=0.30, random_state=42)
print(f"Training set size: {len(X)}")
print(f"Test set size: {len(Xt)}")
# split_test(X, y)
hypertune_parameters()
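# The objective referenced by fmin is defined elsewhere; a minimal sketch of
# the shape hyperopt expects, assuming an XGBoost regressor and the X/Xt/y/yt
# splits above (note that quniform returns floats, so integer parameters
# need casting).
from hyperopt import STATUS_OK
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

def objective(space):
    model = XGBRegressor(
        max_depth=int(space['max_depth']),            # quniform yields floats
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=int(space['min_child_weight']),
        n_estimators=int(space['n_estimators']),
        random_state=int(space['seed']),
    )
    model.fit(X, y)
    loss = mean_squared_error(yt, model.predict(Xt))  # minimize test MSE
    return {'loss': loss, 'status': STATUS_OK}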
####################################### Main program (execution...)
data_Xtrain = np.load('X_train.npy')
data_Ytrain = np.load('y_train.npy')
data_Xtest = np.load('X_test.npy')

'''Tunable parameters'''
imgdata_nmber = len(data_Xtrain)
imgsize = 110        # downscaled image resolution (side length)
drop_ = 0.01         # Dropout rate
batch = 100
epoch = 9
test_n = 0.3         # test_size
validation_n = 0.3   # validation_split

X = data_pross(data_Xtrain[0:imgdata_nmber], imgdata_nmber) / 255.0
Y = to_categorical(data_Ytrain[0:imgdata_nmber])
Xtrain, Xtest, ytrain, ytest = ttsplit(X, Y, random_state=20, test_size=test_n)

model_ = Xception_model(imgsize, channel=3, drop=drop_, class_=3)
history = model_.fit(Xtrain, ytrain, batch_size=batch, epochs=epoch,
                     validation_split=validation_n)
model_.save('task1_cnn.h5')

###################################### Output results + plotting
'''Write the results to output.csv'''
X__test = data_pross(data_Xtest, len(data_Xtest)) / 255.0
# Note: predict_classes was removed in newer Keras; see the sketch below.
yfit = model_.predict_classes(X__test)
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Index', 'Pred'])
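# Sequential.predict_classes was removed in recent Keras releases; a sketch
# of the equivalent under the current API, assuming a softmax output layer.
import numpy as np

# argmax over the class axis reproduces the old predict_classes behaviour
yfit = np.argmax(model_.predict(X__test), axis=1)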
# Map booleans and alignment strings onto small integers. Parentheses added so
# the "neutral"/"Unknown" test compares both strings; the original
# `x == "neutral" or "Unknown"` was always truthy whenever the comparison failed.
f = lambda x: (1 if x == True
               else 0 if x == False
               else 2 if x == "good"
               else 0 if x == "bad"
               else 1 if x in ("neutral", "Unknown")
               else x)
adjusted = df_new.applymap(f)

a = adjusted.drop(columns=[
    'Gender', 'Unnamed: 0', 'Eye color', 'Hair color', 'Race',
    'Publisher', 'Height', 'Skin color', 'Weight'
])
a = a.dropna()

# Separate dependent and independent variables
X = a.iloc[:, 1:]
Y = a.iloc[:, 0]
yonehot = keras.utils.to_categorical(Y)
xfloat = X.astype('float32')

# Split training and testing data
trainx, testx, trainy, testy = ttsplit(xfloat, yonehot, test_size=.2)

# Set up NN
network = keras.models.Sequential()
network.add(keras.layers.Dense(3, input_dim=167, activation='sigmoid'))
sgd = keras.optimizers.SGD(learning_rate=.1)
network.compile(optimizer=sgd, loss='categorical_crossentropy',
                metrics=['accuracy'])

# Run NN
network.fit(trainx, trainy, epochs=30, batch_size=128)
predicty = network.predict(testx)
# Evaluate NN
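# The chained conditional lambda above is easy to misread; an equivalent,
# arguably clearer dict-based mapping. This is a sketch: the .get fallback
# keeps unmapped values unchanged, matching the lambda's final `else x`.
value_map = {True: 1, False: 0, "good": 2, "bad": 0, "neutral": 1, "Unknown": 1}
adjusted = df_new.applymap(lambda x: value_map.get(x, x))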
# Getting the stopwords corpus
stopwords_list = stopwords.words('english')
print(stopwords_list[:5])

# Loading the final merged dataset
youtube_train = pd.read_csv(
    "C:/Users/prate/Desktop/ICT_solution/Data/final_data/final_data.csv",
    delimiter=',')
youtube_train_sen = youtube_train['video_title']
print(youtube_train_sen[1])

# Converting the titles to string format
youtube_train_sen = youtube_train['video_title'].values
youtube_train_sen = youtube_train_sen.astype(str)

# Splitting the data for training and testing
x_train, x_test, y_train, y_test = ttsplit(youtube_train_sen,
                                           youtube_train['tag'].values,
                                           test_size=0.25)

# Initializing the tokenizer with the max number of words set to 5000
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_train)  # fit the tokenizer vocabulary on the training titles
X_train = tokenizer.texts_to_sequences(x_train)
X_test = tokenizer.texts_to_sequences(x_test)

# Adding 1 because of the reserved 0 index
vocab_size = len(tokenizer.word_index) + 1
maxlen = 100
# Convert all sequences to equal lengths, with max length set to 100
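# The padding step announced by the comment above would typically use Keras'
# pad_sequences; a minimal sketch under that assumption.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad (or truncate) every sequence to exactly maxlen tokens
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)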
model = load_model("model.h5", custom_objects={'Attention': Attention,
                                               'binary_loss': binary_loss})

print("Loading Word2Vec Dictionary. This may take a long time...")
w2v = word2vec.KeyedVectors.load_word2vec_format(config.word2VecPath, binary=True)
# w2v = pickle.load(open("word2vec.bin", "rb"))

print("Loading Questions...")
dataFile = "outBoth.csv"
dataRows = pd.read_csv(dataFile)

print('Extracting Training Features...')
X_questions, X_captions, y, errors = extractFeatures(dataRows)
y = np.array(y)
# train_test_split accepts multiple arrays and splits them consistently
X_questions_train, X_questions_test, X_captions_train, X_captions_test, y_train, y_test = \
    ttsplit(X_questions, X_captions, y, test_size=0.25, random_state=1)

best = test(model.predict([X_questions_test, X_captions_test]), y_test)
print("Starting validation accuracy:", 100 * best)
print("Starting training accuracy:",
      100 * test(model.predict([X_questions_train, X_captions_train]), y_train))

if len(sys.argv) > 1 and sys.argv[1] == "train":
    try:
        for epoch in range(1000):
            print("Epoch:")
            model.fit([X_questions_train, X_captions_train], y_train,
                      batch_size=2500, epochs=1, verbose=1)
            print("Training accuracy:",
                  100 * test(model.predict([X_questions_train, X_captions_train]),
                             y_train))
            testacc = test(model.predict([X_questions_test, X_captions_test]), y_test)