from sklearn.svm import LinearSVC from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler from joblib import dump, load from data_handling import load_user_data, load_some_user_data, \ split_features_labels, user_train_test_split from metrics import balanced_accuracy_score, single_balanced_accuracy_score from flex_one_vs_rest_classifier import FlexOneVsRestClassifier if __name__ == "__main__": start = time.time() # load data and reset index data = load_user_data("../../data") data.reset_index(inplace=True) X, y = split_features_labels(data) attrs = X.columns labels = y.columns X = X.values y = y.values # save save uuid for stratified train-test-split le = LabelEncoder() le.fit(X[:, 0]) strat_classes = le.transform(X[:, 0]) X_train, X_test, y_train, y_test = user_train_test_split(X, y, test_size=0.2,
# Prediction script: loads user data, applies a previously trained and
# persisted classifier to it and writes the predicted labels to a CSV file.
#
# Usage: <script> DATA_PATH CLASSIFIER_PATH OUTPUT_PATH
import sys

import pandas as pd
from joblib import load

from data_handling import load_user_data

if __name__ == "__main__":
    # Fail with a readable usage message instead of a bare IndexError when
    # arguments are missing. Extra arguments are still ignored, as before.
    if len(sys.argv) < 4:
        sys.exit(
            "usage: {} DATA_PATH CLASSIFIER_PATH OUTPUT_PATH".format(sys.argv[0])
        )
    data_path, classifier_path, output_path = sys.argv[1:4]

    # Read the data and turn the index levels into ordinary columns so they
    # can be dropped by name below.
    data = load_user_data(data_path)
    data.reset_index(inplace=True)

    # The label source column is optional; remove it only when present.
    if "label_source" in data.columns:
        data = data.drop(["label_source"], axis=1)

    # Remove the non-feature columns. 'level_0' and 'level_1' are presumably
    # the former index levels (uuid and a second level) - TODO confirm against
    # load_user_data. A KeyError is raised if any of these is missing.
    X = data.drop(['level_0', 'level_1', 'timestamp'], axis=1)

    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; classifier_path must point to a trusted file.
    clf = load(classifier_path)
    y_pred = clf.predict(X)

    # One output column per label; the names come from the classifier's
    # `label_names` attribute.
    df = pd.DataFrame(y_pred, columns=clf.label_names)
    df.to_csv(output_path)
import xgboost as xgb
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.pipeline import Pipeline

from data_handling import load_user_data, split_features_labels

if __name__ == '__main__':
    # Load the user data and split it into feature and label frames. The
    # combined frame is never bound to a name, so it can be freed right away.
    features_df, labels_df = split_features_labels(load_user_data())

    # Turn the index levels into ordinary columns.
    features_df.reset_index(inplace=True)

    # The first column after the reset becomes the prediction target
    # (the uuid, according to the original author's note - TODO confirm).
    y = features_df.iloc[:, 0].values

    # Drop the first three columns and the last column from the features
    # (uuid, timestamps and label source per the original note - TODO confirm).
    index_cols = features_df.columns[[0, 1, 2, -1]]
    features_df.drop(columns=index_cols, inplace=True)

    # The remaining columns form the feature matrix; keep their names around.
    X = features_df
    feature_names = X.columns

    # Hold out 20% of the rows for testing, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=41, test_size=0.2
    )