Example #1
from data_preprocessing import preprocessing
from models import QDA  # project-local QDA wrapper (module path assumed)


def main():

    # preprocessing
    dataPre = preprocessing()
    dataPre.process_data()
    xtrain, xtest, ytrain, ytest = dataPre.divide_data()

    # visualizing
    # dataPre.visualize_data()

    # modeling (QDA)
    print("Modeling ...")
    my_model = QDA()
    my_model.train(xtrain, ytrain)
    # my_model.test(xtest)
    acc = my_model.get_accuracy(xtest, ytest)
    print("testing accuracy: {0:.4f}".format(acc))
    print("*********************************************")
Example #2
import streamlit as st
from data_preprocessing import preprocessing

df = preprocessing()

# Keys are stored with a leading '$'; strip it for the display labels.
keys = [key[1:] for key in df.keys()]
option = st.selectbox('Select the type of GPS', keys)
r_df = df['$' + option]

columns = st.multiselect('Select columns to show', list(r_df.columns))
r_df = r_df[columns]

try:
    for col in columns:
        col_min = float(r_df[col].min())
        col_max = float(r_df[col].max())
        if col_min < col_max:
            # Default the slider to the full [min, max] range.
            x = st.slider("Choose the range of " + col, col_min, col_max,
                          (col_min, col_max), 0.5)
            r_df = r_df[r_df[col].between(x[0], x[1])]
        else:
            st.write("This column is a constant of value", col_min)

except ValueError:
    st.write('You cannot choose a range for a non-numeric column')

st.write('You selected:', r_df)
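# The app above expects data_preprocessing.preprocessing() to return a
# mapping from '$'-prefixed GPS sentence types to DataFrames. A minimal
# stand-in data_preprocessing.py for trying the app locally (sentence names
# and values are illustrative):
import pandas as pd


def preprocessing():
    return {
        '$GPGGA': pd.DataFrame({'lat': [48.85, 48.86], 'lon': [2.35, 2.36]}),
        '$GPRMC': pd.DataFrame({'speed': [0.4, 1.2], 'course': [10.0, 12.5]}),
    }

# With the script saved as app.py, the dashboard starts with:
#     streamlit run app.py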
Example #3
import os

import pandas as pd

import data_preprocessing
import logisticRegression, svmClassifier, kNearestNeighbors, randomForest, gridSearch, caterogicalSelection
from sklearn.model_selection import KFold, cross_val_score

os.chdir(r'C:\Users\Katerina\Documents\GitHub\ml_projects\ad_predictor')
#read data
train_data = pd.read_csv('data/train.csv')

#data balance
score_split = train_data.groupby(['score']).count()
score_split.reset_index(inplace=True)
score_split = pd.DataFrame(score_split, columns=['score', 'V2'])
score_split.rename(columns={'V2': 'count'}, inplace=True)

#data preprocessing
preprocess = data_preprocessing.preprocessing(train_data)
numerical_var, categorical_var = preprocess.split_numerical_categorical()
low_corr_var = preprocess.numerical_feature_selection(numerical_var)

################Numerical data classifier####################
#logistic regression classifier
score_LR = {}
for C in [0.1, 1, 10, 100, 1000]:
    regr = logisticRegression.logisticRegression(train_data, low_corr_var, C)
    score_LR[C] = regr.classifier()

score_LR_df = pd.DataFrame(list(score_LR.values()),
                           index=list(score_LR.keys()),
                           columns=['precision', 'recall', 'accuracy'])
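# KFold and cross_val_score are imported above but unused in this fragment.
# A minimal sketch of cross-validating the same C sweep, with sklearn's
# LogisticRegression standing in for the project wrapper ('score' as the
# target column follows the groupby above; the scoring choice is
# illustrative):
from sklearn.linear_model import LogisticRegression

X = train_data[low_corr_var]
y = train_data['score']

cv = KFold(n_splits=5, shuffle=True, random_state=0)
for C in [0.1, 1, 10, 100, 1000]:
    scores = cross_val_score(LogisticRegression(C=C, max_iter=1000),
                             X, y, cv=cv, scoring='accuracy')
    print(C, scores.mean())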
Example #4
from dstoolbox.transformers import Padder2d
from dstoolbox.transformers import TextFeaturizer
import numpy as np
from scipy import stats
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from skorch import NeuralNetClassifier
import torch
from torch import nn
F = nn.functional

import pandas as pd
from sklearn.utils import shuffle

from data_preprocessing import preprocessing

Dp_training = preprocessing()

np.random.seed(0)
torch.manual_seed(0)
# torch.cuda.manual_seed(0)

VOCAB_SIZE = 1000  # This is on the low end
MAX_LEN = 50  # Texts are pretty long on average, this is on the low end
# Set this to False if you don't want to use CUDA.
USE_CUDA = torch.cuda.is_available()
NUM_CV_STEPS = 10  # Number of randomized search steps to perform

steps = [
    ('to_idx', TextFeaturizer(max_features=VOCAB_SIZE)),
    ('pad', Padder2d(max_len=MAX_LEN, pad_value=VOCAB_SIZE, dtype=int)),
]
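# The pipeline still needs its final estimator. A minimal sketch using the
# already-imported NeuralNetClassifier, RandomizedSearchCV, and scipy stats;
# the RNNClassifier module, its layer sizes, and the search space are
# illustrative assumptions, not the original code:
class RNNClassifier(nn.Module):
    def __init__(self, embedding_dim=128, num_units=128):
        super().__init__()
        # One extra embedding row for the padding index used by Padder2d.
        self.emb = nn.Embedding(VOCAB_SIZE + 1, embedding_dim)
        self.rec = nn.LSTM(embedding_dim, num_units, batch_first=True)
        self.out = nn.Linear(num_units, 2)

    def forward(self, X):
        embeddings = self.emb(X)
        _, (h_n, _) = self.rec(embeddings)
        # skorch's NeuralNetClassifier expects probabilities by default.
        return F.softmax(self.out(h_n[-1]), dim=-1)


steps.append(('net', NeuralNetClassifier(
    RNNClassifier,
    device='cuda' if USE_CUDA else 'cpu',
    max_epochs=5,
    lr=0.01,
)))
pipe = Pipeline(steps)

# Randomized search over an illustrative parameter space.
params = {
    'net__module__embedding_dim': stats.randint(32, 256),
    'net__module__num_units': stats.randint(32, 256),
    'net__lr': [0.003, 0.01, 0.03],
}
search = RandomizedSearchCV(pipe, params, n_iter=NUM_CV_STEPS, cv=3,
                            scoring='accuracy', random_state=0)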
Example #5

import pickle
from pathlib import Path

import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor

from data_preprocessing import preprocessing


def run_train(batch_size, n_epochs, n_features):
    # build_model and train are assumed project helpers; the opening of this
    # function is reconstructed from the call site below.
    model = build_model(n_features)
    models_dir = Path('checkpoints')  # assumed checkpoint directory
    models_dir.mkdir(exist_ok=True)
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=models_dir / 'model-{epoch:02d}-{val_loss:.2f}.h5'),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=models_dir /
            'model_weights-{epoch:02d}-{val_loss:.2f}.h5',
            save_weights_only=True)
    ]

    history = train(model,
                    xtrain,
                    ytrain,
                    n_epochs=n_epochs,
                    batch_size=batch_size,
                    callbacks=callbacks)
    return model, history


if __name__ == '__main__':
    xtrain, ytrain, xtest, ytest, movies_df, user_df, reviewed_movies = preprocessing()

    # deep learning
    model, history = run_train(batch_size=64,
                               n_epochs=50,
                               n_features=xtrain.shape[1])

    # random forest
    regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=300)
    regr.fit(xtrain, ytrain)
    pickle.dump(regr, open("checkpoints/rf_model.pkl", 'wb'))
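    # A short usage sketch: the pickled random-forest model can be reloaded
    # for inference later (same path as above; xtest comes from
    # preprocessing()).
    with open("checkpoints/rf_model.pkl", 'rb') as f:
        rf_model = pickle.load(f)
    preds = rf_model.predict(xtest)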
Example #6

import bcolz
import pandas as pd
from pathlib import Path

# Project-local modules; the exact module paths are assumed here.
from data_preprocessing import preprocessing
from depression_detection import Depression_detection
from training_testing import training_testing

# get the parent path
base_path = Path.cwd().parent  # Nour
# base_path = Path.cwd()  # Adrian
training_positive_path = base_path.joinpath(
    './2017/train/positive_examples_anonymous_chunks')
training_negative_path = base_path.joinpath(
    './2017/train/negative_examples_anonymous_chunks')
test_path = base_path.joinpath('./2017/test')

Dd = Depression_detection(base_path, training_positive_path,
                          training_negative_path, test_path)

Dp_training = preprocessing()
Dp_testing = preprocessing()

RNN_preparation = training_testing()

## Concatenate all the frames for each folder after parsing them
training_positive_dataframe = pd.concat(
    Dd.parse_folder(training_positive_path))
training_negative_dataframe = pd.concat(
    Dd.parse_folder(training_negative_path))
test_dataframe = pd.concat(Dd.parse_folder(test_path))

## add labels to the positive and negative training subjects
training_positive_dataframe['LABEL'] = 1
training_negative_dataframe['LABEL'] = 0
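# A natural next step is to stack the two labeled frames into one training
# set; the shuffle below is an illustrative choice, not from the original:
training_dataframe = pd.concat(
    [training_positive_dataframe, training_negative_dataframe],
    ignore_index=True)
training_dataframe = training_dataframe.sample(frac=1, random_state=0)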