Ejemplo n.º 1
0
# Separate features and output, split, then scale.
# pandas has some weird column counting
# (columns 0-1 are used as features and column 2 as the output label)
scaler = StandardScaler()
multiBinarizer = MultiLabelBinarizer()

features = processedData[:, 0:2]

# Convert output to binarized array. MultiLabelBinarizer takes one iterable of
# labels per sample, so the label column is reshaped to a single-column 2-D array.
numbers = np.reshape(processedData[:, 2], (-1, 1))
predOutput = multiBinarizer.fit_transform(numbers)

# Split BEFORE scaling: the row partition depends only on the sample count and
# random_state, so it is the same partition as splitting already-scaled data.
X_train, X_test, y_train, y_test = train_test_split(features, predOutput, test_size=TEST_SIZE_SAMPLE,
                                                    random_state=RANDOM_STATE_SPLIT)

# Fit the scaler on the training rows only, so the held-out rows do not leak
# their mean/variance into the model's inputs; the test rows reuse those statistics.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix, scaled with the training statistics.
# NOTE: a persisted classifier must have been fit on features scaled the same
# way; discard any stale pickle whenever the scaling procedure changes.
trainData = scaler.transform(features)

# Reuse a previously saved classifier when one can be loaded from disk
CRIME_CLF_FILE = "sanfCrimeCLF.pkl"
clf = input_output.load_classifier(CRIME_CLF_FILE)
if not clf:
    # Nothing usable was loaded: fit a model with the best known parameters,
    # then persist it so later runs can skip training
    print("There is no saved classifier!")
    print("Training Model...")
    knn_params = {'n_neighbors': 26, 'weights': 'distance', 'algorithm': 'kd_tree'}
    clf = KNeighborsClassifier(**knn_params)
    clf.fit(X_train, y_train)
    input_output.save_classifier(CRIME_CLF_FILE, clf)
    print("Training Complete!")

# Scoring metric(s) to evaluate with
scores = ['f1']

# print(X_train)

# Find optimal model
Ejemplo n.º 2
0
# Split configuration: held-out fraction and the seed that makes the split repeatable
TEST_SIZE_SAMPLE = 0.25
RANDOM_STATE_SPLIT = 40

# Load the raw CSV and run it through the project's preprocessing step
rawData = input_output.load_data("train.csv")
processedData = preprocess.prepare_data(rawData)

# Column 0 is the output; every remaining column is a feature.
# NOTE: no scaling is applied to the features here.
predOutput = processedData[:, 0]
trainData = processedData[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(
    trainData,
    predOutput,
    test_size=TEST_SIZE_SAMPLE,
    random_state=RANDOM_STATE_SPLIT,
)

# Reuse a previously saved classifier when one can be loaded from disk
TITANIC_CLF_FILE = "titanicCLF.pkl"
clf = input_output.load_classifier(TITANIC_CLF_FILE)
if not clf:
    # Nothing usable was loaded: fit a model with the best known parameters,
    # then persist it so later runs can skip training
    print("There is no saved classifier!")
    print("Training Model...")
    svc_params = {'C': 1, 'kernel': "rbf", 'gamma': 0.1}
    clf = svm.SVC(**svc_params)
    clf.fit(X_train, y_train)
    input_output.save_classifier(TITANIC_CLF_FILE, clf)
    print("Training Complete!")

# Scoring metrics to evaluate with
scores = ['precision', 'recall']

# # SEARCH OPTIMAL PARAMETERS FOR SVM
# svmModel = models.SVMModel(scores, X_train, y_train, y_test, X_test)
# param_grid = svmModel.grid_search_setup()