Ejemplo n.º 1
0
def compute_logloss(df_filenames, df_data):
    #STEP 1: replace values
    replacer = lambda x: max(float(min(x, 0.999999999999)), 0.0000000000000001)
    df_data = df_data.applymap(replacer)

    #STEP 2: rescale
    df_subsum = df_data.sum(axis=1)
    df_sum = pd.concat([
        df_subsum, df_subsum, df_subsum, df_subsum, df_subsum, df_subsum,
        df_subsum, df_subsum, df_subsum, df_subsum
    ],
                       axis=1)
    df_sum.columns = [
        'c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'
    ]
    df_data = df_data / df_sum

    #STEP 3: logloss
    #load correct validationset labels
    labels = Input.load_validationset_labels()
    df_labels = pd.get_dummies(
        labels)  #to one-hot-encoding, DataFrame automatically
    df_labels.columns = [
        'c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'
    ]

    #sort data to have same order as labels
    correct_order = Input.load_validationset_filenames()
    current_order = list(df_filenames.values)
    indices = [current_order.index(filename) for filename in correct_order]
    df_data = df_data.reindex(indices)
    df_data = df_data.reset_index(
    )  #reset index --> adds new indices, old indices become column 'index'
    df_data = df_data.drop('index', axis=1)  #remove this new column 'index'

    #select probabilities of correct classes only
    df_sparse_probs = df_data * df_labels
    probs = df_sparse_probs.values
    probs = list(chain.from_iterable(probs))  #flatten list
    probs = filter(lambda x: x != 0, probs)  #remove all zeros

    #apply log to them and take the average
    log_probs = [math.log(p) for p in probs]
    return -(np.mean(log_probs))
Ejemplo n.º 2
0
validationset_filenames = Input.load_validationset_filenames()
traindata_filenames = Input.load_traindata_filenames()
testset_filenames = Input.load_testdata_filenames()

#Load the features
feat = pd.read_csv('skinTrainFeatures.csv', index_col = 0)

#Select the features for each dataset
x_trainset = feat.ix[trainset_filenames]
x_validationset = feat.ix[validationset_filenames]  
x_testset = feat.ix[testset_filenames]  
x_traindata = feat.ix[traindata_filenames]

#Load the labels for each dataset
y_trainset = np.asarray(Input.load_trainset_labels())
y_validationset = np.asarray(Input.load_validationset_labels())
y_traindata = np.asarray(Input.load_traindata_labels())

#restructure the features so they can be used in the SVM
x_trainset = x_trainset.groupby(x_trainset.index).apply(transformXY)
x_validationset = x_validationset.groupby(x_validationset.index).apply(transformXY)
x_testset = x_testset.groupby(x_testset.index).apply(transformXY)
x_traindata = x_traindata.groupby(x_traindata.index).apply(transformXY)

#Normalise the data
df = x_traindata.iloc[:,1:]
df_norm = (df - df.mean(axis=1)) / (df.max(axis=1) - df.min(axis=1))
x_traindata = df_norm

#Train classifier
clf = OneVsRestClassifier(SVC(C=0.1,kernel='poly', probability=True))