Example #1
import sys
import numpy as np
import pandas as pd
from sklearn.svm import SVC
# cross_val_score, StratifiedShuffleSplit and GridSearchCV live in
# sklearn.model_selection (the original used the long-deprecated
# sklearn.cross_validation and sklearn.grid_search modules)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
sys.path.append(
    '/home/peng/git/Machine_learning_for_reliability_analysis/Preprocess')
import Preprocessdata
import RFclass

## default setting
data_path = '../Data/'
p = Preprocessdata.standardprocess()


# class
class MyClass(object):
    def __init__(self, Training_data):

        df = pd.read_csv(data_path + Training_data, header=0)
        self.train, self.trainlabel, self.test, self.testlabel = p.noscale(
            df, 1.0)
        print('This dataset contains %r samples with %r features' %
              (np.shape(self.train)[0], np.shape(self.train)[1]))
        print('These features are:')
        for i in range(np.shape(self.train)[1]):
            print('Feature %r : %r' % (i + 1, list(df.columns.values)[i]))
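
A minimal usage sketch for the class above; the CSV file name is hypothetical, and any file in data_path with a header row that Preprocessdata.noscale can split would do:

# hypothetical training file located in data_path
dataset = MyClass('reliability_training.csv')
print(np.shape(dataset.train))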
Example #2
import timeit
import random
import sys
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV
sys.path.append(
    '/home/peng/git/Machine_learning_for_reliability_analysis/Preprocess')
import Preprocessdata


start = timeit.default_timer()
df = pd.read_csv('Source_Data.csv', header=0)

p = Preprocessdata.standardprocess()

train, trainlabel, test, testlabel = p.scaledivd(df, 1.0)
print(np.shape(train))


C_range = np.logspace(-10, 10, num=21, base=2, endpoint=True)
gamma_range = np.logspace(-10, 10, num=21, base=2, endpoint=True)

#######################################################################

paramgrid = {"kernel":["poly"],
             "C":C_range,
             "gamma":gamma_range,
             "degree":[3]            
Example #3
import timeit
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
sys.path.append(
    '/home/peng/git/Machine_learning_for_reliability_analysis/Preprocess')
import Preprocessdata
import RFclass


def main():
    start = timeit.default_timer()

    df = pd.read_csv('/home/peng/new160half.csv', header=0)
    #   df['random_number']=np.random.random(size = 160)
    #  df_sort = df.sort(columns='random_number')
    #    df_sort.drop(['random_number'], inplace = True, axis = 1)
    #    df_sort.to_csv('new_random_160.csv', header = 0)

    p = Preprocessdata.standardprocess()
    # #    df_2 = pd.read_csv('/home/peng/git/Machine_learning_for_reliability_analysis//Test_1/score_long_2features_rf.csv', header=0)
    #------------------------------------------------------------------------------
    train, trainlabel, test, testlabel = p.noscale(df, 0.9)
    #    train, trainlabel = p.noaction(df)
    # #    df_2 = pd.read_csv('/home/peng/git/Machine_learning_for_reliability_analysis/score50-500_2.csv', header=0)

    ff = RFclass.training()
    tt = RFclass.test()
    feature_range = np.arange(12, 13, 1)
    tree_range = np.arange(700, 701, 1)

    # -------------------- sensitivity for 10 CV --------------------
    # score = []
    # for i in range(100):
    #     score.append(ff.trainman_sensitivity_CV('adb', train, trainlabel, tree_range, feature_range))
    # print score
    #
    # df_raw_times = pd.DataFrame({'times': np.arange(1, 101, 1), 'scores': score})
    # df_raw_times = ff.str_float(df_raw_times['scores'])
    # df_acc_times = ff.accuracy(df_raw_times)
    # df_acc_times.to_csv('adb_acc_10cv_f12_t700_100times.csv', header=True)
    """Just separate"""
    #===========================================================================
    # forest= ff.trainforest('ext', train, trainlabel,1900,9)
    # y_pred = forest.predict(test)
    # print metrics.precision_score(testlabel,y_pred)
    # cm = metrics.confusion_matrix(testlabel, y_pred)
    # tt.plot_confusion_matrix(cm)
    #===========================================================================
    """the CART single tree"""

    forest = ff.trainforest('cart', train, trainlabel, 20, 1)
    y_pred = forest.predict(test)
    print(metrics.accuracy_score(testlabel, y_pred))
    print(metrics.precision_score(testlabel, y_pred))
    cm = metrics.confusion_matrix(testlabel, y_pred)
    tt.plot_confusion_matrix(cm)

    # score = []
    # for i in range(100):
    #     forest = ff.trainforest('adb', train, trainlabel, 1450, 11)
    #     y_pred = forest.predict(test)
    #     score.append(metrics.accuracy_score(testlabel, y_pred))
    #
    # df = pd.DataFrame({'times': np.arange(1, 101, 1), 'acc_score': score})
    # df.to_csv('adb_acc_63_f12_t1450_100times.csv', header=True)
    # print df.describe()
    #
    # df = pd.read_csv('/home/peng/git/Machine_learning_for_reliability_analysis/Test_1/adb_acc_63_f12_t1450_100times.csv', header=0)
    # plt.plot(df['times'], df['acc_score'])
    # plt.xticks(np.arange(1, 101, 1), np.arange(1, 101, 1))
    # plt.xlabel()
    plt.show()

    # ------------------------------------------------------------------

    # df_66_33 = {'tree_range': tree_range}
    # df_all = pd.DataFrame(df_66_33)

    # scores = ff.trainonlyfeat('bag', train, trainlabel, tree_range, feature_range)
    # scores.to_csv('bag_100_4000_10times.csv', header=True)

    # data = ff.train_repeat_forest_metrics('bag', train, trainlabel, test, testlabel, tree_range, feature_range, 10)
    # data.to_csv('nnnnnn_crazy66_33_100_4000.csv', header=True)

    # data = ff.train_repeat_forest_metrics('adb', train, trainlabel, test, testlabel, tree_range, feature_range, 10)
    # data.to_csv('nnnnnnnnnn_crazy66_33_100_4000.csv', header=True)

    # data = ff.train_repeat_forest_metrics('gbt', train, trainlabel, test, testlabel, tree_range, feature_range, 10)
    # data.to_csv('gbt_crazy66_33_100_4000.csv', header=True)

    # data = ff.trainmanCV('rf', train, trainlabel, tree_range, feature_range)
    # data.to_csv('rf_crazy100_4000.csv', header=True)

    # data = ff.trainmanCV('ext', train, trainlabel, tree_range, feature_range)
    # data.to_csv('ext_crazy100_4000.csv', header=True)

    # data = ff.trainmanCV('bag', train, trainlabel, tree_range, feature_range)
    # data.to_csv('bag_crazy100_4000n.csv', header=True)

    # data = ff.trainmanCV('adb', train, trainlabel, tree_range, feature_range)
    # data.to_csv('adb_crazy100_4000n.csv', header=True)

    # data = ff.trainmanCV('gbt', train, trainlabel, tree_range, feature_range)
    # data.to_csv('gbt_crazy100_4000n.csv', header=True)

    # scores.to_csv('rf_1_5_1_feature4.csv', header=False)

    # print scores
    stop = timeit.default_timer()
    print "The running takes %r min" % ((stop - start) / 60)