Example #1
    #print "start"
    train_data = pd.read_csv(datapath, header=None, index_col=None)
    X = np.array(train_data)
    # Labels: the first half of the rows are positives (1), the second half negatives (0)
    Y = list(map(lambda x: 1, xrange(len(train_data) // 2)))
    Y2 = list(map(lambda x: 0, xrange(len(train_data) // 2)))
    Y.extend(Y2)
    Y = np.array(Y)
    svc = svm.SVC(probability=True)
    # RBF grid search: C = 2^-5 ... 2^13 and gamma = 2^-15 ... 2^-7, stepping the exponent by 2
    parameters = {'kernel': ['rbf'], 'C': [math.pow(2,e) for e in range(-5,15,2)], 'gamma': [math.pow(2,e) for e in range(-15, -5, 2)]}
    #parameters = {'kernel': ['rbf'], 'C':map(lambda x:2**x,np.linspace(-2,5,7)), 'gamma':map(lambda x:2**x,np.linspace(-5,2,7))}
    clf = GridSearchCV(svc, parameters, cv=crossvalidation_values, n_jobs=CPU_values, scoring='accuracy')
    clf.fit(X, Y)
    C=clf.best_params_['C']
    gamma=clf.best_params_['gamma']
    y_predict=cross_val_predict(svm.SVC(kernel='rbf',C=C,gamma=gamma),X,Y,cv=crossvalidation_values,n_jobs=CPU_values)
    y_predict_prob=cross_val_predict(svm.SVC(kernel='rbf',C=C,gamma=gamma,probability=True),X,Y,cv=crossvalidation_values,n_jobs=CPU_values,method='predict_proba')
    joblib.dump(clf,path+classifier+mode+outputname+".model")
    predict_save=[Y.astype(int),y_predict.astype(int),y_predict_prob[:,1]]
    predict_save=np.array(predict_save).T
    pd.DataFrame(predict_save).to_csv(path+'Before_'+classifier+mode+outputname+'_predict_crossvalidation.csv',header=None,index=False)
    ROC_AUC_area=metrics.roc_auc_score(Y,y_predict_prob[:,1])
    ACC=metrics.accuracy_score(Y,y_predict)
    precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
    F1_Score=metrics.f1_score(Y, y_predict)
    F_measure=F1_Score
    MCC=metrics.matthews_corrcoef(Y, y_predict)
    pos=TP+FN
    neg=FP+TN
    savedata=[[['SVM'+"C:"+str(C)+"gamma:"+str(gamma),ACC,precision, recall,SN, SP, GM,F_measure,F1_Score,MCC,ROC_AUC_area,TP,FN,FP,TN,pos,neg]]]
    easy_excel.save(classifier+"_crossvalidation",[str(X.shape[1])],savedata,path+'cross_validation_'+classifier+"_"+outputname+'.xls')
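These snippets all rely on a user-defined performance() helper that is not shown on this page. Based on how its return values are unpacked (precision, recall, SN, SP, GM, TP, TN, FP, FN), a minimal sketch of what it plausibly computes for binary 0/1 labels might be:

import math
from sklearn import metrics

def performance(y_true, y_pred):
    # Confusion-matrix counts for binary 0/1 labels
    TP = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    TN = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 0)
    FP = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    FN = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    SN = recall                                        # sensitivity = TP / (TP + FN)
    SP = float(TN) / (TN + FP) if (TN + FP) else 0.0   # specificity
    GM = math.sqrt(SN * SP)                            # geometric mean of SN and SP
    return precision, recall, SN, SP, GM, TP, TN, FP, FN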
Example #2
        F_measure = F1_Score
        MCC = metrics.matthews_corrcoef(Y, y_predict)
        pos = TP + FN
        neg = FP + TN
        savedata = [[[
            'xgboost' + "n_estimators:" + str(n_estimators) + "max_depth:" +
            str(max_depth) + "learning_rate:" + str(learning_rate), ACC,
            precision, recall, SN, SP, GM, F_measure, F1_Score, MCC,
            ROC_AUC_area, TP, FN, FP, TN, pos, neg
        ]]]
        if ACC > bestACC:
            bestACC = ACC
            bestn_estimators = n_estimators
            bestlearning_rate = learning_rate
            best_savedata = savedata
            bestmax_depth = max_depth
            best_dimension = X.shape[1]
        print savedata
        print X.shape[1]
        with open(classifier + mode + "all_dimension_results.txt", 'a') as f:
            f.write(str(savedata) + "\n")
        all_dimension_results.append(savedata)
    print bestACC
    print bestn_estimators
    print bestlearning_rate
    print bestmax_depth
    print best_dimension
    easy_excel.save(
        "xgboost_jackknife", [str(best_dimension)], best_savedata,
        path + classifier + mode + 'jackknife_' + outputname + '.xls')
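Example #2 picks up in the middle of what appears to be a hyperparameter sweep: n_estimators, max_depth and learning_rate are loop variables defined before the fragment starts, and the best setting by accuracy is kept. A self-contained sketch of the kind of enclosing sweep the fragment implies (the grid values and the synthetic data are illustrative, not taken from the original code):

import numpy as np
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

X = np.random.rand(100, 20)           # placeholder feature matrix
Y = np.array([1] * 50 + [0] * 50)     # first half positive, second half negative

bestACC = 0
best_params = None
for n_estimators in [50, 100, 200]:   # illustrative grid
    for max_depth in [3, 5, 7]:
        for learning_rate in [0.05, 0.1, 0.3]:
            model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  learning_rate=learning_rate)
            y_predict = cross_val_predict(model, X, Y, cv=5, n_jobs=-1)
            ACC = metrics.accuracy_score(Y, y_predict)
            if ACC > bestACC:
                bestACC = ACC
                best_params = (n_estimators, max_depth, learning_rate)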
Example #3
        TN_all = TN_all + TN
        FP_all = FP_all + FP
        FN_all = FN_all + FN
        F_measure_all = F_measure_all + F_measure
        F1_Score_all = F1_Score_all + F1_Score
        pos_all = pos_all + pos
        neg_all = neg_all + neg
        MCC_all = MCC_all + MCC
    all_y = [
        np.array(Y_all).astype(int),
        np.array(y_pred_all).astype(int),
        np.array(y_pred_prob_all).astype(list)[:, 1]
    ]
    pd.DataFrame(np.matrix(all_y).T).to_csv(path + outputname + "_predict.csv",
                                            header=None,
                                            index=False)
    fpr, tpr, thresholds = roc_curve(
        np.array(Y_all).T, list(np.array(y_pred_prob_all).astype(list)[:, 1]))
    roc_auc = auc(fpr, tpr)
    savedata = [[[
        'svm' + "C:" + str(C) + "gamma:" + str(gamma), ACC_all / divided_num,
        precision_all / divided_num, recall_all / divided_num,
        SN_all / divided_num, SP_all / divided_num, GM_all / divided_num,
        F_measure_all / divided_num, F1_Score_all / divided_num,
        MCC_all / divided_num, roc_auc, TP_all, FN_all, FP_all, TN_all,
        pos_all, neg_all
    ]]]
    print savedata
    easy_excel.save("svm_independent_test", [str(X_train.shape[1])], savedata,
                    path + outputname + '.xls')
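Example #3 computes fpr, tpr and the AUC but never plots them. If a figure is wanted, a minimal matplotlib sketch that reuses those variables from the fragment above (the output file name is illustrative) would be:

import matplotlib.pyplot as plt

plt.plot(fpr, tpr, label='ROC (AUC = %.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--')   # chance diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.savefig('roc_curve.png')               # illustrative file name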
Example #4
		predict_save=np.array(predict_save).T
		pd.DataFrame(predict_save).to_csv(path+classifier+mode+outputname+"_"+'_predict_crossvalidation.csv',header=None,index=False)
		ROC_AUC_area=metrics.roc_auc_score(Y,y_predict)  # note: AUC computed from hard class predictions here, not probabilities
		ACC=metrics.accuracy_score(Y,y_predict)
		precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
		F1_Score=metrics.f1_score(Y, y_predict)
		F_measure=F1_Score
		MCC=metrics.matthews_corrcoef(Y, y_predict)
		pos=TP+FN
		neg=FP+TN
		savedata=[[['xgboost'+"n_estimators:"+str(n_estimators)+"max_depth:"+str(max_depth)+"learning_rate:"+str(learning_rate),ACC,precision, recall,SN, SP, GM,F_measure,F1_Score,MCC,ROC_AUC_area,TP,FN,FP,TN,pos,neg]]]
		if ACC>bestACC:
			bestACC=ACC
			bestn_estimators=n_estimators
			bestlearning_rate=learning_rate
			best_savedata=savedata
			bestmax_depth=max_depth
			best_dimension=X.shape[1]
		print savedata
		print X.shape[1]
		with open(classifier+mode+"all_dimension_results.txt",'a') as f:
			f.write(str(savedata)+"\n")
		all_dimension_results.append(savedata)
	print bestACC
	print bestn_estimators
	print bestlearning_rate
	print bestmax_depth
	print best_dimension
	easy_excel.save("xgboost_crossvalidation",[str(best_dimension)],best_savedata,path+classifier+mode+'cross_validation_'+name+'.xls')

Example #5
def SVM_distance(inputname,outputname,distance,crossvalidation_values,CPU_values,SVM_distance_results):
    datapath =inputname
    classifier="SVM"
    mode="crossvalidation"
    print "start"
    train_data = pd.read_csv(datapath, header=None, index_col=None)
    print len(train_data)
    Y = list(map(lambda x: 1, xrange(len(train_data) // 2)))
    Y2 = list(map(lambda x: 0, xrange(len(train_data) // 2)))
    Y.extend(Y2)
    Y = np.array(Y)
    # Rank features by ANOVA F-value (descending)
    F, pval = f_classif(train_data, Y)
    idx = np.argsort(F)
    selected_list_=idx[::-1]
    F_sort_value=[F[e] for e in selected_list_]
    with open(SVM_distance_results+outputname+"all_dimension_results.txt",'w') as f:
            f.write(str(F_sort_value)+"\n")
    with open(SVM_distance_results+outputname+"all_dimension_results.txt",'a') as f:
            f.write(str(selected_list_)+"\n")
    
    print "deal with data"
    selected_list_=[a  for a,b in zip(selected_list_,F_sort_value) if not math.isnan(b)]
    with open(SVM_distance_results+outputname+"all_dimension_results.txt",'a') as f:
            f.write(str(selected_list_)+"\n")
    
    bestACC=0
    best_c=0
    best_g=0
    best_dimension=0
    all_dimension_results=[]
    select_list=[]
    best_savedata=""
    select_num1=0
    # Grow the feature subset in blocks of `distance` features, taken in descending F-score order, and evaluate an SVM on each subset
    for select_num in range(0,len(selected_list_),distance):
        if select_num > 0:
           for select_num1 in range(select_num-distance+1,select_num+1):  
               temp_data=selected_list_[select_num1]
               select_list.append(int(temp_data))
               train_data2=train_data.values
               X_train=pd.DataFrame(train_data2)
               X_train=X_train.iloc[:,select_list]
               X = np.array(X_train)
        else:
            temp_data=selected_list_[select_num]
            select_list.append(int(temp_data))
            train_data2=train_data.values
            X_train=pd.DataFrame(train_data2)
            X_train=X_train.iloc[:,select_list]
            X = np.array(X_train)
        svc = svm.SVC(probability=True)
        parameters = {'kernel': ['rbf'], 'C':map(lambda x:2**x,np.linspace(-2,5,7)), 'gamma':map(lambda x:2**x,np.linspace(-5,2,7))}
        clf = GridSearchCV(svc, parameters, cv=crossvalidation_values, n_jobs=CPU_values, scoring='accuracy')
        clf.fit(X, Y)
        C=clf.best_params_['C']
        gamma=clf.best_params_['gamma']
       
        y_predict=cross_val_predict(svm.SVC(kernel='rbf',C=C,gamma=gamma),X,Y,cv=crossvalidation_values,n_jobs=CPU_values)
        y_predict_prob=cross_val_predict(svm.SVC(kernel='rbf',C=C,gamma=gamma,probability=True),X,Y,cv=crossvalidation_values,n_jobs=CPU_values,method='predict_proba')
        
        joblib.dump(clf,SVM_distance_results+outputname+"_"+classifier+mode+str(select_num+1)+".model")
        predict_save=[Y.astype(int),y_predict.astype(int),y_predict_prob[:,1]]
        predict_save=np.array(predict_save).T
        #pd.DataFrame(predict_save).to_csv('Before_'+path+classifier+mode+outputname+"_"+'_predict_crossvalidation.csv',header=None,index=False)
        ROC_AUC_area=metrics.roc_auc_score(Y,y_predict_prob[:,1])
        ACC=metrics.accuracy_score(Y,y_predict)
        precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
        F1_Score=metrics.f1_score(Y, y_predict)
        F_measure=F1_Score
        MCC=metrics.matthews_corrcoef(Y, y_predict)
        pos=TP+FN
        neg=FP+TN
        savedata=[[['SVM'+"C:"+str(C)+"gamma:"+str(gamma),ACC,precision, recall,SN, SP, GM,F_measure,F1_Score,MCC,ROC_AUC_area,TP,FN,FP,TN,pos,neg]]]
        if ACC>bestACC:
            bestACC=ACC
            best_c=C
            best_g=gamma
            best_savedata=savedata
            best_dimension=X.shape[1]
        print savedata
        print X.shape[1]
        with open(SVM_distance_results+outputname+"all_dimension_results.txt",'a') as f:
            f.write(str(savedata)+"\n")
        all_dimension_results.append(savedata)
    print bestACC
    print best_c
    print best_g
    print best_dimension
    y_predict1=cross_val_predict(svm.SVC(kernel='rbf',C=best_c,gamma=best_g),X,Y,cv=crossvalidation_values,n_jobs=CPU_values)
    y_predict_prob1=cross_val_predict(svm.SVC(kernel='rbf',C=best_c,gamma=best_g,probability=True),X,Y,cv=crossvalidation_values,n_jobs=CPU_values,method='predict_proba')
    predict_save1=[Y.astype(int),y_predict1.astype(int),y_predict_prob1[:,1]]
    predict_save1=np.array(predict_save1).T
    pd.DataFrame(predict_save1).to_csv(SVM_distance_results+outputname+"_"+classifier+mode+'_best_dim_pro_features.csv',header=None,index=False)
    easy_excel.save("SVM_crossvalidation",[str(best_dimension)],best_savedata,SVM_distance_results+outputname+"_"+classifier+mode+'_best_ACC.xls')
    return y_predict_prob1[:,1]
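A hypothetical call to SVM_distance (the file names, step size and output directory below are placeholders, not taken from the original code):

best_dim_probs = SVM_distance(inputname='features.csv',      # placeholder CSV of features
                              outputname='run1',
                              distance=10,                    # add 10 F-ranked features per step
                              crossvalidation_values=5,
                              CPU_values=-1,
                              SVM_distance_results='results/')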
Example #6
                                                     X,
                                                     Y,
                                                     cv=10,
                                                     n_jobs=-1).mean()
            ACC = metrics.accuracy_score(Y, y_predict)
            precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(
                Y, y_predict)
            F1_Score = metrics.f1_score(Y, y_predict)
            F_measure = F1_Score
            MCC = metrics.matthews_corrcoef(Y, y_predict)
            pos = TP + FN
            neg = FP + TN

            whole_result.append([[
                'svm' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision,
                recall, SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area, TP,
                FN, FP, TN, pos, neg
            ]])
            whole_dimension.append(str(X.shape[1]))
            print whole_result
    easy_excel.save("svm_crossvalidation", whole_dimension, whole_result,
                    'svm.xls')
# print RFH_PseDNC
# RFH_=pd.read_csv(sys.argv[1],header=None,index_col=None)
# RFH_=pd.DataFrame(RFH_).astype(float)
# PseDNC_=pd.read_csv(sys.argv[2],header=None,index_col=None)
# print len(PseDNC_.values[0])
# RFH_PseDNC=pd.concat([RFH_,PseDNC_],axis=1)
# pd.DataFrame(RFH_PseDNC).to_csv(sys.argv[3],header=None,index=False)
# print RFH_PseDNC
Example #7
            print u'>>>', name, 'is training...searching best parms...'
            if isMultipleThread:
                new_thread = ClassifyThread(name, grid_search, X_train, y_train, test_x=X_test, test_y=y_test)
                new_thread.start()
                threads.append(new_thread)
            else:
                loop_classifier(name, grid_search, X_train, y_train, test_x=X_test, test_y=y_test)
        else:
            experiment = '交叉验证结果'  # i.e. "cross-validation results"
            print u'>>>', name, 'is cross validating...searching best parms...'
            if isMultipleThread:
                new_thread = ClassifyThread(name, grid_search, X, y, cv=cv)
                new_thread.start()
                threads.append(new_thread)
            else:
                loop_classifier(name, grid_search, X, y, cv=cv)
        print 'Time cost: ', clock() - sec
    # Wait for all threads to finish
    for t in threads:
        t.join()
    dimensions.append(str(dimension))
    big_results.append(results)
print 'Time cost: ', clock() - sec

# Save the results to Excel
print '====================='
if easy_excel.save(experiment, dimensions, big_results, excel_name):
    print 'Save excel result file successfully.'
else:
    print 'Failed. Please close excel result file first.'
Example #8
    X_predict = clf.predict(X)
    X_predict_proba = clf.predict_proba(X)
    print X_predict_proba
    #print X_predict
    #print len(X_predict)
    pd.DataFrame(X_predict_proba[:, 1]).to_csv(outputname1 + 'predict_proba.csv',
                                               header=None, index=False)
    ROC_AUC_area = metrics.roc_auc_score(Y, X_predict_proba[:, 1])
    ACC = metrics.accuracy_score(Y, X_predict)
    precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, X_predict)
    F1_Score = metrics.f1_score(Y, X_predict)
    F_measure = F1_Score
    MCC = metrics.matthews_corrcoef(Y, X_predict)
    pos = TP + FN
    neg = FP + TN
    C = clf.best_params_['C']
    gamma = clf.best_params_['gamma']
    print X.shape[1]
    print name1
    print ACC
    savedata = [[[
        'SVM' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision, recall,
        SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN,
        pos, neg
    ]]]
    easy_excel.save(
        "SVM_crossvalidation", [str(X.shape[1])], savedata,
        'SVM_crossvalidation' + name1 + '_Predict_' + name + '.xls')
Example #9
            else:
                loop_classifier(name,
                                grid_search,
                                X_train,
                                y_train,
                                test_x=X_test,
                                test_y=y_test)
        else:
            experiment = '交叉验证结果'  # i.e. "cross-validation results"
            print u'>>>', name, 'is cross validating...searching best parms...'
            if isMultipleThread:
                new_thread = ClassifyThread(name, grid_search, X, y, cv=cv)
                new_thread.start()
                threads.append(new_thread)
            else:
                loop_classifier(name, grid_search, X, y, cv=cv)
        print 'Time cost: ', clock() - sec
    # Wait for all threads to finish
    for t in threads:
        t.join()
    dimensions.append(str(dimension))
    big_results.append(results)
print 'Time cost: ', clock() - sec

# Save the results to Excel
print '====================='
if easy_excel.save(experiment, dimensions, big_results, excel_name):
    print 'Save excel result file successfully.'
else:
    print 'Failed. Please close excel result file first.'
Example #10
def SVM_calssfier(input_feature, proba_dir, model_dir, result_dir,
                  crossvalidation_values, CPU_values, output_name, t):
    #print(input_feature)
    #output_name = input_feature.split("\\")[-1].split(".")[0]
    #print(output_name)
    classifier = 'SVM'
    X = input_feature
    Y = list(map(lambda x: 1, xrange(len(input_feature) // 2)))
    Y2 = list(map(lambda x: 0, xrange(len(input_feature) // 2)))
    Y.extend(Y2)
    Y = np.array(Y)
    d = input_feature.shape[1]
    svc = svm.SVC(probability=True)
    parameters = {
        'kernel': ['rbf'],
        'C': map(lambda x: 2**x, np.linspace(-2, 5, 7)),
        'gamma': map(lambda x: 2**x, np.linspace(-5, 2, 7))
    }
    clf = GridSearchCV(svc,
                       parameters,
                       cv=crossvalidation_values,
                       n_jobs=CPU_values,
                       scoring='accuracy')
    clf.fit(X, Y)
    C = clf.best_params_['C']
    gamma = clf.best_params_['gamma']
    y_predict = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma),
                                  X,
                                  Y,
                                  cv=crossvalidation_values,
                                  n_jobs=CPU_values)
    y_predict_prob = cross_val_predict(svm.SVC(kernel='rbf',
                                               C=C,
                                               gamma=gamma,
                                               probability=True),
                                       X,
                                       Y,
                                       cv=crossvalidation_values,
                                       n_jobs=CPU_values,
                                       method='predict_proba')
    joblib.dump(clf, model_dir + "\\" + str(t) + "time.model")

    predict_save = [y_predict_prob[:, 0], y_predict_prob[:, 1]]
    predict_save = np.array(predict_save).T
    pd.DataFrame(predict_save).to_csv(proba_dir + str(t) + 'time.csv',
                                      header=None,
                                      index=False)
    ROC_AUC_area = metrics.roc_auc_score(Y, y_predict_prob[:, 1])
    ACC = metrics.accuracy_score(Y, y_predict)
    precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
    F1_Score = metrics.f1_score(Y, y_predict)
    F_measure = F1_Score
    MCC = metrics.matthews_corrcoef(Y, y_predict)
    pos = TP + FN
    neg = FP + TN
    savedata = [[[
        'SVM' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision, recall,
        SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN,
        pos, neg
    ]]]
    #print(savedata)
    #print(result_dir+output_name+'cross_validation_'+str(d)+'D_'+str(t)+'time.xls')
    easy_excel.save(
        classifier + "_crossvalidation", [str(X.shape[1])], savedata,
        result_dir + output_name + 'cross_validation_' + str(d) + 'D_' +
        str(t) + 'time.xls')
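A hypothetical call to SVM_calssfier (the feature file, directories and run index below are placeholders, not taken from the original code):

import pandas as pd

features = pd.read_csv('features.csv', header=None, index_col=None)   # placeholder feature file
SVM_calssfier(features, 'proba/', 'models', 'results/',
              crossvalidation_values=5, CPU_values=-1,
              output_name='run1_', t=1)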
Example #11
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=-1),
        AdaBoostClassifier(),
        LinearSVC(),
        GaussianNB()]

    # Load the raw data
    second = clock()
    X, y = get_data(input_file)
    results = []
    print 'Time cost on loading data: ', clock() - second

    # Split the data or cross-validate, and collect the results
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=split_rate, random_state=0)
    for name, model in zip(names, classifiers):
        if cv == 0:
            print u'>>>', name, 'is training...'
            out = loop_classifier(name, model, X_train, y_train, test_x=X_test, test_y=y_test)
        else:
            print u'>>>', name, 'is cross validating...'
            out = loop_classifier(name, model, X, y, cv=cv)
        if out is not None:
            results.append(out)

    # Save the results to Excel
    print '====================='
    if easy_excel.save(str(X_train.shape[1]), results):
        print 'Save "results.xls" successfully.'
    else:
        print 'Fail to save "results.xls". Please close "results.xls" first.'
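Note that cross_validation.train_test_split is the pre-0.18 scikit-learn API; on current scikit-learn releases the same call lives in sklearn.model_selection:

from sklearn.model_selection import train_test_split   # replaces sklearn.cross_validation in scikit-learn >= 0.18

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_rate, random_state=0)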