コード例 #1
0
ファイル: train.py プロジェクト: kailex/kaggle-galaxy
def assess(estimator, X, y):
    """Score ``estimator`` on (X, y) by RMSE in the un-scaled response space.

    Predictions and targets are both mapped back through the scaler stored
    at "<data folder>/scaler1" before the error is computed. Predictions
    below 1e-7 are clamped to zero first.

    As a side effect, up to five of the worst-predicted rows are printed
    (per-row RMSE followed by ``int(X[i, 0])``), but only those whose
    per-row RMSE exceeds 0.1.
    NOTE(review): column 0 of X presumably holds the galaxy id -- confirm.

    Returns the square root of the mean of the per-row mean squared errors.
    """
    raw_predictions = estimator.predict(X)

    response_scaler = joblib.load(galaxy.get_data_folder() + "/scaler1")
    predicted = response_scaler.inverse_transform(raw_predictions)

    # Clamp tiny (and negative) predictions to exactly zero.
    predicted[predicted < 1e-7] = 0

    actual = response_scaler.inverse_transform(y)

    # One mean squared error per row, averaged over the response columns.
    row_mse = np.mean((predicted - actual) ** 2, axis=1)
    row_rmse = np.sqrt(row_mse)

    # Row indices ordered from largest to smallest error.
    worst_first = np.argsort(row_mse)[::-1]

    for row in worst_first[:5]:
        if row_rmse[row] > 0.1:
            # A single parenthesised expression prints the same text as the
            # two-item Python 2 print statement.
            print("%s %s" % (row_rmse[row], int(X[row, 0])))

    return math.sqrt(row_mse.mean())
コード例 #2
0
    folder = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/images_training_rev1/"
else:
    folder = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/images_test_rev1/"

# Path of the training-solutions CSV; it is not read anywhere in this span.
f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"

# Every entry matched directly inside the image folder selected above.
files = glob(folder + "/*")

print "will predict", len(files), "galaxies"
print "from", folder

# One galaxy.get_features call per file, dispatched through Parallel.
# The generator's loop variable f is local to the generator expression, so
# the module-level f above keeps its value.
# NOTE(review): n_jobs=-1 presumably means "use all cores" (joblib) -- confirm.
points = Parallel(n_jobs=-1)(delayed(galaxy.get_features)(f) for f in files)

mapping = galaxy.get_fieldnames()

# Load the persisted forest model and set its n_jobs parameter to 1.
forest = joblib.load(galaxy.get_data_folder() + "/galaxy_forest")
forest.set_params(n_jobs=1)

# Pass the features through the persisted "scaler2" before predicting.
scaler2 = joblib.load(galaxy.get_data_folder() + "/scaler2")
points = scaler2.transform(points)

predictions = forest.predict(points)

# Map the predictions back through the persisted "scaler1".
# NOTE(review): presumably the scaler fitted on the training responses --
# confirm against the training script.
scaler1 = joblib.load(galaxy.get_data_folder() + "/scaler1")
predictions = scaler1.inverse_transform(predictions)

# Sparsify: set every prediction below 1e-7 (including negatives) to zero.
# The smallest training response is noted as 2.9099999e-06, i.e. above the
# 1e-7 cut-off.
predictions[predictions < 1e-7] = 0

#f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
コード例 #3
0
        print "will learn from", len(selected_ids), "galaxies for Class", Class,c,svm_class

        for transform in [0]:#xrange(3):
            points = Parallel(n_jobs=-1)(delayed(process_galaxy)(galaxy_id,transform=transform)
                                         for galaxy_id in selected_ids )
        X.extend(points)
        Y.extend([svm_class for i in xrange(len(points))])
        svm_class += 1

    X = np.array(X,dtype='float')
    Y = np.array(Y,dtype='int')

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    joblib.dump( scaler, galaxy.get_data_folder()+"/scaler_statistics_Class"+ str(Class)+"_")
        
    print "got",len(X),"points"
    print np.bincount(Y,minlength=svm_class)             

    # train best SVM
    clf = SVC( kernel='rbf',
               class_weight='auto',
               probability=True,
               C=10.0,
               gamma=0.005 )
    clf.fit(X, Y)
    joblib.dump(clf, galaxy.get_data_folder()+"/svm_statistics_Class"+ str(Class)+"_")

exit(0)
コード例 #4
0
        for transform in [0]:  #xrange(3):
            points = Parallel(n_jobs=-1)(
                delayed(process_galaxy)(galaxy_id, transform=transform)
                for galaxy_id in selected_ids)
        X.extend(points)
        Y.extend([svm_class for i in xrange(len(points))])
        svm_class += 1

    X = np.array(X, dtype='float')
    Y = np.array(Y, dtype='int')

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    joblib.dump(
        scaler,
        galaxy.get_data_folder() + "/scaler_statistics_Class" + str(Class) +
        "_")

    print "got", len(X), "points"
    print np.bincount(Y, minlength=svm_class)

    # train best SVM
    clf = SVC(kernel='rbf',
              class_weight='auto',
              probability=True,
              C=10.0,
              gamma=0.005)
    clf.fit(X, Y)
    joblib.dump(
        clf,
        galaxy.get_data_folder() + "/svm_statistics_Class" + str(Class) + "_")
コード例 #5
0
ファイル: predict.py プロジェクト: kailex/kaggle-galaxy
else:
    folder = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/images_test_rev1/"

# Path of the training-solutions CSV; it is not read anywhere in this span.
f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"

# Every entry matched directly inside the image folder selected above.
files = glob(folder+"/*")

print "will predict", len(files), "galaxies" 
print "from", folder

# One galaxy.get_features call per file, dispatched through Parallel.
# The generator's loop variable f is local to the generator expression, so
# the module-level f above keeps its value.
# NOTE(review): n_jobs=-1 presumably means "use all cores" (joblib) -- confirm.
points = Parallel(n_jobs=-1)(delayed(galaxy.get_features)(f)
                             for f in files )

mapping = galaxy.get_fieldnames()

# Load the persisted forest model and set its n_jobs parameter to 1.
forest = joblib.load(galaxy.get_data_folder()+"/galaxy_forest")
forest.set_params(n_jobs=1)

# Pass the features through the persisted "scaler2" before predicting.
scaler2 = joblib.load(galaxy.get_data_folder()+"/scaler2")
points = scaler2.transform(points)

predictions = forest.predict( points )

# Map the predictions back through the persisted "scaler1".
# NOTE(review): presumably the scaler fitted on the training responses --
# confirm against the training script.
scaler1 = joblib.load(galaxy.get_data_folder()+"/scaler1")
predictions = scaler1.inverse_transform(predictions)

# Sparsify: set every prediction below 1e-7 (including negatives) to zero.
# The smallest training response is noted as 2.9099999e-06, i.e. above the
# 1e-7 cut-off.
predictions[predictions<1e-7] = 0

#f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
コード例 #6
0
ファイル: train.py プロジェクト: kailex/kaggle-galaxy
    return rmse

def shuffle(a, b):
    """
    Reorder ``a`` and ``b`` with one shared random permutation.

    Both inputs must have the same length (checked with an assert) and are
    indexed with a permutation array, so they need to support index-array
    indexing (e.g. numpy arrays). Element i of the first result still lines
    up with element i of the second.

    Returns the pair (shuffled a, shuffled b).

    http://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
    """
    n_items = len(a)
    assert n_items == len(b)
    order = np.random.permutation(n_items)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

# Path of the training-solutions CSV.
f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
# read_responses yields a (responses, ids) pair for that file.
responses, ids = galaxy.read_responses( f )

# Fit a scaler on the responses, replace them with their scaled form, and
# persist the fitted scaler as "<data folder>/scaler1".
# NOTE(review): StandardScaler is presumably scikit-learn's -- confirm import.
scaler1 = StandardScaler()
responses = scaler1.fit_transform(responses)
joblib.dump( scaler1, galaxy.get_data_folder()+"/scaler1" )

# mapping = galaxy.get_classes()
# selection = {}
# for Class in xrange(1,12):
#     classes = np.nonzero(mapping==Class)[0]    
#     for c in classes:
#         q = 0.95
#         threshold = mquantiles( responses[:,c], q )        
#         tmp_selection = np.nonzero(responses[:,c]>=threshold)[0]
#         for i in tmp_selection:
#             selection[i] = 1

# tmp_responses = []
# tmp_ids = []
# for i in selection.keys():
コード例 #7
0
ファイル: train.py プロジェクト: ruchirgarg05/kaggle-galaxy
def shuffle(a, b):
    """
    Reorder ``a`` and ``b`` with one shared random permutation.

    Both inputs must have the same length (checked with an assert) and are
    indexed with a permutation array, so they need to support index-array
    indexing (e.g. numpy arrays). Element i of the first result still lines
    up with element i of the second.

    Returns the pair (shuffled a, shuffled b).

    http://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
    """
    n_items = len(a)
    assert n_items == len(b)
    order = np.random.permutation(n_items)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b


# Path of the training-solutions CSV.
f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
# read_responses yields a (responses, ids) pair for that file.
responses, ids = galaxy.read_responses(f)

# Fit a scaler on the responses, replace them with their scaled form, and
# persist the fitted scaler as "<data folder>/scaler1".
# NOTE(review): StandardScaler is presumably scikit-learn's -- confirm import.
scaler1 = StandardScaler()
responses = scaler1.fit_transform(responses)
joblib.dump(scaler1, galaxy.get_data_folder() + "/scaler1")

# mapping = galaxy.get_classes()
# selection = {}
# for Class in xrange(1,12):
#     classes = np.nonzero(mapping==Class)[0]
#     for c in classes:
#         q = 0.95
#         threshold = mquantiles( responses[:,c], q )
#         tmp_selection = np.nonzero(responses[:,c]>=threshold)[0]
#         for i in tmp_selection:
#             selection[i] = 1

# tmp_responses = []
# tmp_ids = []
# for i in selection.keys():
コード例 #8
0
rng = RandomState(0)

###############################################################################
def plot_gallery(title, images, n_col=n_col, n_row=n_row):
    """Draw ``images`` into an n_row x n_col grid and save it as '<title>.png'.

    title -- figure super-title; also the stem of the output file name.
    images -- iterable of arrays; each is reshaped to the module-level
              ``image_shape`` before being shown.
    n_col, n_row -- grid dimensions; they also scale the figure size.

    Every image is shown in gray with a colour range symmetric around zero.
    """
    fig_width = 2. * n_col
    fig_height = 2.26 * n_row
    pl.figure(figsize=(fig_width, fig_height))
    pl.suptitle(title, size=16)

    # Subplot positions are 1-based, so count the cells from 1.
    for cell, component in enumerate(images, 1):
        pl.subplot(n_row, n_col, cell)
        # Largest absolute value, so that zero lands on mid-gray.
        limit = max(component.max(), -component.min())
        pl.imshow(component.reshape(image_shape),
                  cmap=pl.cm.gray,
                  interpolation='nearest',
                  vmin=-limit,
                  vmax=limit)
        # Hide the tick marks on both axes.
        pl.xticks(())
        pl.yticks(())

    pl.subplots_adjust(left=0.01, bottom=0.05, right=0.99, top=0.93,
                       wspace=0.04, hspace=0.)
    pl.savefig(title + '.png')
    
###############################################################################
# Plot a sample of the input data

# For each class index 1..11: load that class's persisted PCA object, print
# the shape of its components and its explained-variance ratios, save a
# gallery of the first n_components components, and write the PCA mean
# (reshaped to image_shape) as a PNG.
for i in xrange(1,12):
    #pca = joblib.load(galaxy.get_data_folder()+"/pca_"+str(i)+"_")
    # File name pattern: "<data folder>/pca_<thumbnail>_Class<i>_".
    pca = joblib.load(galaxy.get_data_folder()+"/pca_"+thumbnail+"_Class"+ str(i)+"_")
    print pca.components_.shape
    print pca.explained_variance_ratio_
    # plot_gallery saves the figure as "Class_<thumbnail><i>.png".
    plot_gallery("Class_"+thumbnail+str(i), pca.components_[:n_components])

    cv2.imwrite("mean_Class_"+thumbnail+str(i)+".png",np.reshape(pca.mean_,image_shape))
    
#pl.show()
コード例 #9
0
    n_components = 50

    print "Extracting the top %d eigenfaces from %d faces" % (n_components, X.shape[0])
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X)
    print "done in %0.3fs" % (time() - t0)

    print pca.explained_variance_ratio_

    print "Projecting the input data on the eigenfaces orthonormal basis"
    t0 = time()
    X_pca = pca.transform(X)
    print "done in %0.3fs" % (time() - t0)

    # save PCA
    joblib.dump(pca, galaxy.get_data_folder()+"/pca_color_Class"+ str(Class)+"_")

    # train best SVM
    clf = SVC( kernel='rbf',
               class_weight='auto',
               probability=True,
               C=5000.0,
               gamma=0.0001 )
    clf.fit(X_pca, Y)
    joblib.dump(clf, galaxy.get_data_folder()+"/pca_color_SVM_Class" + str(Class)+"_")

exit(0)

###############################################################################
# Train a SVM classification model
コード例 #10
0
ファイル: learn_pca_SVM.py プロジェクト: kailex/kaggle-galaxy
    n_components = 50

    print "Extracting the top %d eigenfaces from %d faces" % (n_components, X.shape[0])
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X)
    print "done in %0.3fs" % (time() - t0)

    print pca.explained_variance_ratio_

    print "Projecting the input data on the eigenfaces orthonormal basis"
    t0 = time()
    X_pca = pca.transform(X)
    print "done in %0.3fs" % (time() - t0)

    # save PCA
    joblib.dump(pca, galaxy.get_data_folder()+"/pca_"+thumbnail+"_Class"+ str(Class)+"_")

    # train best SVM
    clf = SVC( kernel='rbf',
               class_weight='auto',
               probability=True,
               C=50000.0,
               gamma=5e-05 )
    clf.fit(X_pca, Y)
    joblib.dump(clf, galaxy.get_data_folder()+"/pca_"+thumbnail+"_SVM_Class" + str(Class)+"_")

exit(0)

###############################################################################
# Train a SVM classification model
コード例 #11
0
                                                              X.shape[0])
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X)
    print "done in %0.3fs" % (time() - t0)

    print pca.explained_variance_ratio_

    print "Projecting the input data on the eigenfaces orthonormal basis"
    t0 = time()
    X_pca = pca.transform(X)
    print "done in %0.3fs" % (time() - t0)

    # save PCA
    joblib.dump(
        pca,
        galaxy.get_data_folder() + "/pca_color_Class" + str(Class) + "_")

    # train best SVM
    clf = SVC(kernel='rbf',
              class_weight='auto',
              probability=True,
              C=5000.0,
              gamma=0.0001)
    clf.fit(X_pca, Y)
    joblib.dump(
        clf,
        galaxy.get_data_folder() + "/pca_color_SVM_Class" + str(Class) + "_")

exit(0)

###############################################################################