import math
from itertools import chain

import numpy as np
import pandas as pd

from IO import Input


def compute_logloss(df_filenames, df_data):
    # STEP 1: clamp values away from 0 and 1 so log() stays finite
    replacer = lambda x: max(float(min(x, 0.999999999999)), 0.0000000000000001)
    df_data = df_data.applymap(replacer)

    # STEP 2: rescale each row so the class probabilities sum to 1 again
    df_subsum = df_data.sum(axis=1)
    df_sum = pd.concat([df_subsum] * 10, axis=1)
    df_sum.columns = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']
    df_data = df_data / df_sum

    # STEP 3: log loss
    # load the correct validation-set labels
    labels = Input.load_validationset_labels()
    df_labels = pd.get_dummies(labels)  # one-hot encoding, returns a DataFrame
    df_labels.columns = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']

    # sort the data so it is in the same order as the labels
    correct_order = Input.load_validationset_filenames()
    current_order = list(df_filenames.values)
    indices = [current_order.index(filename) for filename in correct_order]
    df_data = df_data.reindex(indices)
    df_data = df_data.reset_index()          # adds new indices, old indices become column 'index'
    df_data = df_data.drop('index', axis=1)  # remove this new column 'index'

    # select the probabilities of the correct classes only
    df_sparse_probs = df_data * df_labels
    probs = df_sparse_probs.values
    probs = list(chain.from_iterable(probs))  # flatten the nested rows
    probs = [p for p in probs if p != 0]      # remove all zeros

    # apply log and take the average
    log_probs = [math.log(p) for p in probs]
    return -(np.mean(log_probs))
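
# Minimal sketch of what compute_logloss measures, on synthetic numbers (an
# illustration only, not project data): multi-class log loss is the negative mean
# log-probability that the model assigned to the true class of each sample.
example_probs = np.array([[0.7, 0.2, 0.1],
                          [0.1, 0.8, 0.1]])   # predicted class probabilities per row
example_labels = np.array([0, 1])             # index of the correct class per row
example_correct = example_probs[np.arange(2), example_labels]
example_logloss = -np.mean(np.log(example_correct))  # roughly 0.29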
import pickle

import numpy as np
import pandas as pd
from sklearn.svm import SVC, NuSVC, LinearSVC

from IO import Input, Output


def transformXY(coords):
    '''
    Helper function to use with the grouping of the DataFrame:
    turns 3 rows of coordinates into a single row.
    '''
    return pd.Series(np.asarray(coords).ravel())


# Load the file names of the various datasets
trainset_filenames = Input.load_trainset_filenames()
validationset_filenames = Input.load_validationset_filenames()
traindata_filenames = Input.load_traindata_filenames()
testset_filenames = Input.load_testdata_filenames()

# Load the features
feat = pd.read_csv('skinTrainFeatures.csv', index_col=0)

# Select the features for each dataset (.loc replaces the removed .ix indexer)
x_trainset = feat.loc[trainset_filenames]
x_validationset = feat.loc[validationset_filenames]
x_testset = feat.loc[testset_filenames]
x_traindata = feat.loc[traindata_filenames]

# Load the labels for each dataset
y_trainset = np.asarray(Input.load_trainset_labels())
y_validationset = np.asarray(Input.load_validationset_labels())
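
# A minimal sketch of how these splits might feed a classifier (an assumption, not
# the project's confirmed pipeline): fit an SVC with probability estimates on the
# training split, then score the validation split with compute_logloss (assumed to
# be in scope from the snippet above, with ten classes named c0..c9). The column
# order of predict_proba follows clf.classes_.
clf = SVC(kernel='rbf', C=1.0, probability=True)
clf.fit(x_trainset.values, y_trainset)

val_probs = clf.predict_proba(x_validationset.values)
df_predictions = pd.DataFrame(
    val_probs,
    columns=['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'])
validation_score = compute_logloss(pd.Series(validationset_filenames), df_predictions)
print('validation log loss: %.5f' % validation_score)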