def train(train_x, train_y):
    """Fit an 'rf' model on the training data and print its feature ranking.

    The model is built by ``rf.training().trainforest`` with the fixed size
    argument 1000 and ``accuracy_train_calculation=True``.  The importance
    table returned by ``rf.training().importance`` is then extended with a
    "keyword" column that maps every "Ranking" entry to the matching column
    name of ``train_x``.  Both the column names and the table are printed.

    Args:
        train_x: Training features.  Must expose ``.columns.values`` (used to
            look up feature names) -- presumably a pandas DataFrame.
        train_y: Training labels, forwarded unchanged to ``trainforest``.

    Returns:
        The fitted model object returned by ``trainforest``.
    """
    forest = rf.training().trainforest(
        'rf', train_x, train_y, 1000, accuracy_train_calculation=True)

    # Second axis of train_x, i.e. its number of columns.
    n_columns = np.shape(train_x)[1]
    importance = rf.training().importance(forest, n_columns)

    feature_list = list(train_x.columns.values)
    print(feature_list)

    def _column_name(position):
        # "Ranking" values are used as positional indexes into feature_list.
        return feature_list[position]

    importance["keyword"] = importance["Ranking"].map(_column_name)
    print(importance)

    # Dependence plots are currently disabled (feature_set is not defined here):
    # rf.training().dependence(forest, train_x, feature_set)
    # rf.training().dependence3d(forest, train_x, feature_set)
    return forest
#--------------------------------------------------------- tl.set_color('b')
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------- ax2 = ax1.twinx()
#---------- ax2.plot(df['Tree'], pd.rolling_std(df['Score'], window = 10), 'r-')
#-------------------------------------- ax2.set_ylabel('Rolling std', color='r')
#---------------------------------------------- for tl in ax2.get_yticklabels():
#--------------------------------------------------------- tl.set_color('r')
#-------------------------------------------------------------------- plt.show()
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------- stop = timeit.default_timer()
#--------------------------- print "The running takes %r min" %((stop-start)/60)

# Project helper objects (created left to right, same order as before).
ff, tt, pp = RFclass.training(), RFclass.test(), Superplot.fancy()
figsize(9.5, 7)

# Bagging results: df1 is read from the "acc" CSV, df2 from the "prec" CSV.
df1 = pd.read_csv('//home/peng/git/Machine_learning_for_reliability_analysis/Test_1/Results/Ensemble/statistical_csv/bag_acc_10cv_100_4000.csv', header=0)
df2 = pd.read_csv('//home/peng/git/Machine_learning_for_reliability_analysis/Test_1/Results/Ensemble/statistical_csv/bag_prec_10cv_100_4000.csv', header=0)

print("This is the training set of field data.")
print("The size of the data is ")
# NOTE(review): `df` is not assigned anywhere in the visible code; this line
# relies on it being defined earlier in the session -- confirm.
print(df.describe())

# Both curves share df1's 'tree_range' column as the x axis.
plt.plot(df1['tree_range'], df1['12'], label='Accuracy')
plt.plot(df1['tree_range'], df2['12'], label='Precision')
def test(test_x, test_y, train_model):
    """Run ``rf.test().testforest`` on the given test data and model.

    Args:
        test_x: Test features, forwarded unchanged.
        test_y: Test labels, forwarded unchanged.
        train_model: Previously trained model, forwarded unchanged.

    Returns:
        None.  Whatever ``testforest`` returns is discarded.
    """
    tester = rf.test()
    tester.testforest(test_x, test_y, train_model)
#------------------------------------------------------------------------------ 
#------------------------------------------------------------- ax2 = ax1.twinx()
#---------- ax2.plot(df['Tree'], pd.rolling_std(df['Score'], window = 10), 'r-')
#-------------------------------------- ax2.set_ylabel('Rolling std', color='r')
#---------------------------------------------- for tl in ax2.get_yticklabels():
    #--------------------------------------------------------- tl.set_color('r')
#-------------------------------------------------------------------- plt.show()
#------------------------------------------------------------------------------ 
#------------------------------------------------------------------------------ 
#------------------------------------------------------------------------------ 
#------------------------------------------------- stop = timeit.default_timer()
#--------------------------- print "The running takes %r min" %((stop-start)/60)

    
        
# Project helper objects used by the rest of the script.
ff = RFclass.training()
tt = RFclass.test()
pp = Superplot.fancy()
figsize(9.5, 7)

# df1 is read from the "acc" CSV and df2 from the "prec" CSV (in that order).
df1, df2 = (
    pd.read_csv(
        '//home/peng/git/Machine_learning_for_reliability_analysis/Test_1/Results/Ensemble/statistical_csv/bag_acc_10cv_100_4000.csv',
        header=0),
    pd.read_csv(
        '//home/peng/git/Machine_learning_for_reliability_analysis/Test_1/Results/Ensemble/statistical_csv/bag_prec_10cv_100_4000.csv',
        header=0),
)

print("This is the training set of field data.")
print("The size of the data is ")
# NOTE(review): `df` is not assigned anywhere in the visible code; this line
# relies on it being defined earlier in the session -- confirm.
print(df.describe())

# Both curves share df1's 'tree_range' column as the x axis.
plt.plot(df1['tree_range'], df1['12'], label='Accuracy')
plt.plot(df1['tree_range'], df2['12'], label='Precision')

# Enlarged fonts for legend, tick labels and the y-axis label.
plt.legend(fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylabel('Classification metrics', fontsize=24)
 def training(self):
     """Fit a 100-estimator ``RFC`` model and hand it to the importance helper.

     Reads ``self.train`` and ``self.trainlabel`` as the fitting data, then
     calls ``RFclass.training().importance`` on the fitted model with the
     fixed arguments ``12``, ``color='#66cdaa'`` and ``plot_std=False``.

     Returns:
         The object returned by ``RFC(...).fit(...)``.
     """
     model = RFC(n_estimators=100).fit(self.train, self.trainlabel)
     # NOTE(review): 12 is presumably the number of features -- confirm.
     RFclass.training().importance(model, 12, color='#66cdaa', plot_std=False)
     return model
# Example no. 6
# 0
 def training(self):
     """Train an ``RFC`` model with 100 estimators and report its importances.

     The model is fitted on ``self.train`` / ``self.trainlabel`` and passed to
     ``RFclass.training().importance`` together with the fixed arguments
     ``12``, ``color='#66cdaa'`` and ``plot_std=False``.

     Returns:
         The fitted model, i.e. the return value of ``fit``.
     """
     classifier = RFC(n_estimators=100)
     fitted = classifier.fit(self.train, self.trainlabel)

     importance_helper = RFclass.training()
     # NOTE(review): 12 is presumably the number of features -- confirm.
     importance_helper.importance(
         fitted, 12, color='#66cdaa', plot_std=False)
     return fitted
# Example no. 7
# 0
def main(csv_path='/home/peng/new160half.csv'):
    """Train a single 'cart' model on a CSV data set and report test metrics.

    Steps:
      1. Load the CSV at ``csv_path`` (first row is the header).
      2. Split it with ``Preprocessdata.standardprocess().noscale(df, 0.9)``
         into train / trainlabel / test / testlabel.
      3. Fit a model with ``RFclass.training().trainforest('cart', ...)``.
      4. Print the accuracy and precision on the test part, plot the
         confusion matrix and show the figure.
      5. Print the elapsed wall-clock time in minutes.

    Args:
        csv_path: Path of the CSV file to load.  Defaults to the location
            that used to be hard-coded, so ``main()`` behaves as before.

    Returns:
        None.
    """
    start = timeit.default_timer()

    df = pd.read_csv(csv_path, header=0)

    p = Preprocessdata.standardprocess()
    # NOTE(review): 0.9 is presumably the training fraction -- confirm
    # against Preprocessdata.standardprocess().noscale.
    train, trainlabel, test, testlabel = p.noscale(df, 0.9)

    ff = RFclass.training()
    tt = RFclass.test()

    # NOTE: earlier experiments (10-fold CV sensitivity sweeps and repeated
    # train/test metric runs for the 'rf', 'ext', 'bag', 'adb' and 'gbt'
    # models, each written to CSV) used to live here as commented-out code;
    # recover them from version control if needed.

    # The CART single tree.
    forest = ff.trainforest('cart', train, trainlabel, 20, 1)
    y_pred = forest.predict(test)
    print(metrics.accuracy_score(testlabel, y_pred))
    print(metrics.precision_score(testlabel, y_pred))
    cm = metrics.confusion_matrix(testlabel, y_pred)
    tt.plot_confusion_matrix(cm)

    plt.show()

    stop = timeit.default_timer()
    print("The running takes %r min" % ((stop - start) / 60))
# Example no. 8
# 0
def testRF(test_x, test_y, forest):
    """Evaluate a trained model via ``rf.test().testforest`` and return the result.

    Previously the result was bound to an unused local and dropped; it is now
    returned so callers can use it.  Callers that ignored the former ``None``
    return value are unaffected.

    Args:
        test_x: Test features, forwarded unchanged.
        test_y: Test labels, forwarded unchanged.
        forest: Previously trained model, forwarded unchanged.

    Returns:
        Whatever ``rf.test().testforest`` returns.
    """
    return rf.test().testforest(test_x, test_y, forest)
# Example no. 9
# 0
def trainRF(train_x, train_y, n_estimator):
    """Fit an 'rf' model and compute its feature importances.

    Args:
        train_x: Training features; must be two-dimensional so that its
            second axis gives the number of feature columns.
        train_y: Training labels, forwarded unchanged to ``trainforest``.
        n_estimator: Size argument forwarded to ``trainforest`` --
            presumably the number of trees; confirm against ``rf.training``.

    Returns:
        Tuple ``(forest, importances)``: the fitted model and the result of
        ``rf.training().importance``.
    """
    # Local import keeps this function self-contained.
    import numpy as np

    forest = rf.training().trainforest('rf', train_x, train_y, n_estimator)

    # The second argument of importance() is the feature count everywhere
    # else in this file (np.shape(train_x)[1] in train(), 12 in training()).
    # Passing n_estimator here mixed up tree count and feature count.
    n_features = np.shape(train_x)[1]
    importances = rf.training().importance(forest, n_features)

    return forest, importances