from data_preprocessing import preprocessing
from qda_model import QDA  # assumed module name for the project's QDA wrapper


def main():
    # preprocessing
    dataPre = preprocessing()
    dataPre.process_data()
    xtrain, xtest, ytrain, ytest = dataPre.divide_data()

    # visualizing
    # dataPre.visualize_data()

    # modeling (QDA)
    print("Modeling ...")
    my_model = QDA()
    my_model.train(xtrain, ytrain)
    # my_model.test(xtest)
    acc = my_model.get_accuracy(xtest, ytest)
    print("testing accuracy : ", "{0:.4f}".format(acc))
    print("*********************************************")
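# A minimal sketch of what the imported QDA wrapper might look like; the
# method names come from main() above, while the sklearn backend is an
# assumption, not the project's actual implementation.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score


class QDASketch:
    def __init__(self):
        self.clf = QuadraticDiscriminantAnalysis()

    def train(self, xtrain, ytrain):
        # fit the quadratic discriminant model on the training split
        self.clf.fit(xtrain, ytrain)

    def get_accuracy(self, xtest, ytest):
        # score the held-out split with plain accuracy
        return accuracy_score(ytest, self.clf.predict(xtest))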
import streamlit as st

from data_preprocessing import preprocessing

df = preprocessing()
keys = [i[1:] for i in list(df.keys())]
option = st.selectbox('Select the type of GPS', keys)
r_df = df['$' + option]

columns = st.multiselect('Select columns to show', list(r_df.columns))
col_names = list(columns)
r_df = r_df[col_names]

try:
    for i in col_names:
        if float(r_df[i].min()) < float(r_df[i].max()):
            # default the slider to the column's full [min, max] range
            x = st.slider("Choose the range of " + i,
                          float(r_df[i].min()), float(r_df[i].max()),
                          (float(r_df[i].min()), float(r_df[i].max())), 0.5)
            r_df = r_df[r_df[i].between(x[0], x[1])]
        else:
            st.write("This column is a constant of value", r_df[i].min())
except ValueError:
    st.write('You cannot choose a range for a non-numeric column')

st.write('You selected:', r_df)
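# A hedged stand-in for the mapping this app expects from preprocessing():
# '$'-prefixed GPS sentence names as keys and DataFrames as values. The
# sentence names and columns below are illustrative only, useful for trying
# the widgets without the real data pipeline.
import pandas as pd


def fake_preprocessing():
    return {
        '$GNGGA': pd.DataFrame({'latitude': [48.10, 48.12],
                                'longitude': [11.50, 11.61]}),
        '$GNRMC': pd.DataFrame({'speed': [0.4, 1.2],
                                'course': [90.0, 87.5]}),
    }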
import os

import pandas as pd
from sklearn.model_selection import KFold, cross_val_score

import data_preprocessing
import logisticRegression, svmClassifier, kNearestNeighbors, randomForest, gridSearch, caterogicalSelection

os.chdir(r'C:\Users\Katerina\Documents\GitHub\ml_projects\ad_predictor')

# read data
train_data = pd.read_csv('data/train.csv')

# data balance
score_split = train_data.groupby(['score']).count()
score_split.reset_index(inplace=True)
score_split = pd.DataFrame(score_split, columns=['score', 'V2'])
score_split.rename(columns={'V2': 'count'}, inplace=True)

# data preprocessing
preprocess = data_preprocessing.preprocessing(train_data)
numerical_var, categorical_var = preprocess.split_numerical_categorical()
low_corr_var = preprocess.numerical_feature_selection(numerical_var)

################ Numerical data classifier ####################
# logistic regression classifier: sweep the inverse-regularization strength C
score_LR = {}
for C in [0.1, 1, 10, 100, 1000]:
    regr = logisticRegression.logisticRegression(train_data, low_corr_var, C)
    score_LR[C] = regr.classifier()

score_LR_df = pd.DataFrame(list(score_LR.values()),
                           index=list(score_LR.keys()),
                           columns=['precision', 'recall', 'accuracy'])
score_LR_df[['precision0',
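# The listing above is cut off mid-statement. As a separate, self-contained
# sketch of the logisticRegression wrapper swept over C above: it is expected
# to return (precision, recall, accuracy) for one value of C. The sklearn
# backend, the train/test split, and the macro averaging are assumptions; the
# 'score' target comes from the data-balance check above.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split


class logisticRegressionSketch:
    def __init__(self, data, features, C):
        self.data, self.features, self.C = data, features, C

    def classifier(self):
        X = self.data[self.features]
        y = self.data['score']
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
        clf = LogisticRegression(C=self.C, max_iter=1000).fit(X_train, y_train)
        pred = clf.predict(X_test)
        return (precision_score(y_test, pred, average='macro'),
                recall_score(y_test, pred, average='macro'),
                accuracy_score(y_test, pred))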
import numpy as np
import pandas as pd
import torch
from dstoolbox.transformers import Padder2d, TextFeaturizer
from scipy import stats
from sklearn.datasets import load_files
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from skorch import NeuralNetClassifier
from torch import nn

from data_preprocessing import preprocessing

F = nn.functional

Dp_training = preprocessing()

np.random.seed(0)
torch.manual_seed(0)
# torch.cuda.manual_seed(0)

VOCAB_SIZE = 1000  # This is on the low end
MAX_LEN = 50  # Texts are pretty long on average, this is on the low end
USE_CUDA = torch.cuda.is_available()  # Set this to False if you don't want to use CUDA
NUM_CV_STEPS = 10  # Number of randomized search steps to perform

steps = [
    ('to_idx', TextFeaturizer(max_features=VOCAB_SIZE)),
    ('pad', Padder2d(max_len=MAX_LEN, pad_value=VOCAB_SIZE, dtype=int)),
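# The listing above is cut off mid-pipeline. As a separate, self-contained
# sketch (architecture, hyperparameters, and search space are assumptions,
# not the project's actual settings), such a pipeline would typically close
# with a skorch NeuralNetClassifier wrapping a small LSTM, tuned with
# RandomizedSearchCV over NUM_CV_STEPS draws:
class RNNClassifierSketch(nn.Module):
    def __init__(self, embedding_dim=128, num_units=128, dropout=0.0):
        super().__init__()
        # one extra embedding row for the padding index used by Padder2d
        self.emb = nn.Embedding(VOCAB_SIZE + 1, embedding_dim)
        self.rec = nn.LSTM(embedding_dim, num_units, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.out = nn.Linear(num_units, 2)

    def forward(self, X):
        embeddings = self.emb(X)
        _, (h_n, _) = self.rec(embeddings)   # final hidden state
        return self.out(self.drop(h_n[-1]))  # raw logits


pipe = Pipeline([
    ('to_idx', TextFeaturizer(max_features=VOCAB_SIZE)),
    ('pad', Padder2d(max_len=MAX_LEN, pad_value=VOCAB_SIZE, dtype=int)),
    ('net', NeuralNetClassifier(
        RNNClassifierSketch,
        criterion=nn.CrossEntropyLoss,
        device='cuda' if USE_CUDA else 'cpu',
        max_epochs=5,
        lr=0.01,
    )),
])

search = RandomizedSearchCV(
    pipe,
    {'net__lr': stats.loguniform(1e-4, 1e-1),
     'net__module__num_units': [64, 128, 256]},
    n_iter=NUM_CV_STEPS,
    cv=3,
)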
        tf.keras.callbacks.ModelCheckpoint(
            filepath=models_dir / 'model-{epoch:02d}-{val_loss:.2f}.h5'),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=models_dir / 'model_weights-{epoch:02d}-{val_loss:.2f}.h5',
            save_weights_only=True)
    ]
    history = train(model,
                    xtrain,
                    ytrain,
                    n_epochs=n_epochs,
                    batch_size=batch_size,
                    callbacks=callbacks)
    return model, history


if __name__ == '__main__':
    xtrain, ytrain, xtest, ytest, movies_df, user_df, reviewed_movies = preprocessing()

    # deep learning
    model, history = run_train(batch_size=64,
                               n_epochs=50,
                               n_features=xtrain.shape[1])

    # random forest
    regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=300)
    regr.fit(xtrain, ytrain)
    with open("checkpoints/rf_model.pkl", 'wb') as f:
        pickle.dump(regr, f)
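    # A follow-up sketch (not in the original file): reload the pickled random
    # forest and score it on the held-out split to sanity-check the dump above.
    with open("checkpoints/rf_model.pkl", 'rb') as f:
        rf_model = pickle.load(f)
    print("random forest R^2 on test split:", rf_model.score(xtest, ytest))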
from pathlib import Path

import bcolz
import pandas as pd

from data_preprocessing import preprocessing
# module names for the remaining project-local imports are assumed
from depression_detection import Depression_detection
from training_testing import training_testing

# get the parent path
base_path = Path.cwd().parent  # Nour
# base_path = Path.cwd()  # Adrian

training_positive_path = base_path.joinpath(
    './2017/train/positive_examples_anonymous_chunks')
training_negative_path = base_path.joinpath(
    './2017/train/negative_examples_anonymous_chunks')
test_path = base_path.joinpath('./2017/test')

Dd = Depression_detection(base_path, training_positive_path,
                          training_negative_path, test_path)
Dp_training = preprocessing()
Dp_testing = preprocessing()
RNN_preparation = training_testing()

## Concatenate all the frames for each folder after parsing them
training_positive_dataframe = pd.concat(
    Dd.parse_folder(training_positive_path))
training_negative_dataframe = pd.concat(
    Dd.parse_folder(training_negative_path))
test_dataframe = pd.concat(Dd.parse_folder(test_path))

## add labels to the positive and negative subjects in the training dataset
training_positive_dataframe['LABEL'] = 1
training_negative_dataframe['LABEL'] = 0
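## A hedged sketch of the likely next step (not shown above): combine the two
## labelled frames into a single shuffled training dataframe.
training_dataframe = pd.concat(
    [training_positive_dataframe, training_negative_dataframe],
    ignore_index=True)
training_dataframe = training_dataframe.sample(
    frac=1, random_state=0).reset_index(drop=True)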