# ---- Code example #1 ----
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from joblib import dump, load

from data_handling import load_user_data, load_some_user_data, \
    split_features_labels, user_train_test_split
from metrics import balanced_accuracy_score, single_balanced_accuracy_score
from flex_one_vs_rest_classifier import FlexOneVsRestClassifier

if __name__ == "__main__":
    # Train/evaluate script: loads user data, encodes the uuid column for a
    # stratified user-wise split, then splits into train and test sets.
    # NOTE(review): `time` is used here but not imported in this excerpt.
    start = time.time()
    # load data and reset index
    data = load_user_data("../../data")
    data.reset_index(inplace=True)
    # split into feature matrix X and label matrix y, keeping column names
    X, y = split_features_labels(data)
    attrs = X.columns
    labels = y.columns
    X = X.values
    y = y.values

    # encode the uuid column (column 0) as integer classes for a
    # stratified train-test split
    # NOTE(review): `LabelEncoder` is used but not imported in this excerpt
    # (expected: from sklearn.preprocessing import LabelEncoder).
    le = LabelEncoder()
    le.fit(X[:, 0])
    strat_classes = le.transform(X[:, 0])

    # user-wise 80/20 train-test split
    # NOTE(review): this call is truncated in this excerpt -- the remaining
    # arguments and the closing parenthesis are missing.
    X_train, X_test, y_train, y_test = user_train_test_split(X,
                                                             y,
                                                             test_size=0.2,
# ---- Code example #2 ----
import sys
import pandas as pd

from joblib import load
from data_handling import load_user_data

if __name__ == "__main__":
    # Predict labels for a data set with a previously trained classifier
    # and write the predictions to a CSV file.
    #
    # Usage: <script> DATA_PATH CLASSIFIER_PATH OUTPUT_PATH
    if len(sys.argv) != 4:
        # fail with a clear usage message instead of an IndexError
        sys.exit("usage: <script> DATA_PATH CLASSIFIER_PATH OUTPUT_PATH")
    data_path, classifier_path, output_path = sys.argv[1:4]

    # read and process the data to predict on
    data = load_user_data(data_path)
    data.reset_index(inplace=True)
    feature_names = data.columns

    # drop the label source if present -- it is metadata, not a feature
    if "label_source" in data.columns:
        data = data.drop(["label_source"], axis=1)

    # drop the uuid column and the timestamps
    X = data.drop(['level_0', 'level_1', 'timestamp'], axis=1)

    # load the persisted classifier and predict
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load
    # classifier files from trusted sources.
    clf = load(classifier_path)
    y_pred = clf.predict(X)

    # wrap predictions with the classifier's label names and save as CSV
    df = pd.DataFrame(y_pred, columns=clf.label_names)
    df.to_csv(output_path)
# ---- Code example #3 ----
import xgboost as xgb
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.pipeline import Pipeline

from data_handling import load_user_data, split_features_labels


if __name__ == '__main__':
    # XGBoost training script: loads user data, uses the uuid column as the
    # prediction target, and performs a train-test split.
    # load data and reset index
    user_data = load_user_data()
    features_df, labels_df = split_features_labels(user_data)
    # free the original frame early to reduce peak memory
    del user_data
    features_df.reset_index(inplace=True)

    # The uuid column (column 0) is used as the prediction target, i.e. the
    # task appears to be identifying the user from the features.
    # NOTE(review): labels_df is unused in this excerpt -- confirm the uuid
    # target is intended.
    y = features_df.iloc[:, 0].values
    # drop the uuid column, the two timestamp columns and the trailing
    # label_source column from the features
    index_cols = features_df.columns[[0, 1, 2, -1]]
    features_df.drop(index_cols, axis=1, inplace=True)
    feature_names = features_df.columns
    X = features_df

    # train-test split and wrap output in data frame to save column names
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=41)