def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag): from sklearn import cluster import plotter clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=13) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=13) print "Performing experiemnt # 3: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions # print "Sanitized versions that will be used in study ", len(sanitizedVersions) # print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions) # print "zzzz", len(NonZero_sanitizedVersionsWithScore) ### dyumping scores ... brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1] # print "lalalaa ", onlyTheNonZeroSanitizedVScores # strOfScoresToDump="" # for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" ### # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... 
", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) ##### plotting clusters start # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore) # low_cluster_x = [ 22.35294118 for x in low_cluster_y] # hig_cluster_x = [ 50.82030058 for x in high_cluster_y] # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y) ##### plottign clusters end else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg ############################## themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
def experiemnt_one(dbFileName, meanFlag, outputStrParam):
    # Experiment #1: split sanitized versions into high / low vulnerability
    # score groups around a mean-or-median threshold, dump each group's
    # code-quality rows to CSV, then run logistic regression on the dump.
    print "Performing experiment # 1"
    #import correlation as corr_
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    ''' Stats on risk score-->len=721, median=51.1111111111, mean=38.0255199862, max=53.3333333333, min=0.0, '''
    riskStatus = sanityCheck.getVulnerbailityScoreStatus(sanitizedVersionsWithScore)
    if meanFlag:
        threshold = riskStatus[0]  ## first returned index is mean
    else:
        threshold = riskStatus[1]  # NOTE(review): presumably the median -- confirm in sanityCheck
    ##############################
    sanitizedVersions_CQ = sanitizedVersions
    ####### high vScore versions started
    high_CQ_dict = utility.getHighVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold)
    high_vScore_Dict = utility.getHighVScoreVersions_VScore(sanitizedVersionsWithScore, threshold)
    print "high_vscore_versions ", len(high_vScore_Dict)
    ####### high vScore versions ended
    ####### low vScore versions started
    low_CQ_dict = utility.getLowVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold)
    low_vScore_Dict = utility.getLowVScoreVersions_VScore(sanitizedVersionsWithScore, threshold)
    print "len_vscore_versions ", len(low_vScore_Dict)
    ####### low vScore versions ended
    ##### dumpin time
    ### three ways: first by dumping all highs then all lows
    themegaFile_Seperated = outputStrParam + "_" + "all-CQ-HL-Seperated.csv"
    IO_.dumpIntoFileByHighAndLow(themegaFile_Seperated, high_CQ_dict, low_CQ_dict)
    ### three ways : second by dumping as it si
    themegaFile_All = outputStrParam + "_" + "all-CQ-HL.csv"
    IO_.dumpIntoFile(themegaFile_All, sanitizedVersions_CQ, sanitizedVersionsWithScore, threshold, False)
    LGR.performLogiRegression(themegaFile_All)
def mobilesoft_cart(fileNameParam, fileToWriteP): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) trainData = testAndTrainData[0] testData = testAndTrainData[1] selected_training_data = pca_mobilesoft.getPCAedFeatures(trainData) print "Size of selected training data : ", np.shape(selected_training_data) print "=" * 50 dict_of_results = param_exp_classifier.runCART(selected_training_data, testData, 0.90) reportStr = param_exp_analysis.analyzeThis(dict_of_results) IO_.writeStrToFile(fileToWriteP, reportStr)
def speedup_random_forest(fileNameParam, fileToWriteP): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) trainData = testAndTrainData[0] testData = testAndTrainData[1] #print trainData selected_training_data = pca_mobilesoft.getPCAedFeatures(trainData) print "Size of selected training data : ", np.shape(selected_training_data) print "="*50 dict_of_results = runRandomForest(selected_training_data, testData) reportStr = param_exp_analysis.analyzeThis(dict_of_results) IO_.writeStrToFile(fileToWriteP, reportStr)
def experiemnt_two(dbFileName, meanFlag, outputStrParam ):
    # Experiment #2: same high/low split and dump as experiment #1, but
    # restricted to versions with a NON-ZERO vulnerability score.
    print "Performing experiemnt # 2"
    #import correlation as corr_
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    ''' Stats on risk score (non-zero elemnts)-->len=549, median=51.1111111111, mean=49.9387976503, max=53.3333333333, min=15.0 '''
    ##############################
    sanitizedVersions_CQ = sanitizedVersions
    riskStatus = sanityCheck.getVulnerbailityScoreStatus(NonZero_sanitizedVersionsWithScore)
    if meanFlag:
        threshold = riskStatus[0] ## first returned index is mean
    else:
        threshold = riskStatus[1]  # NOTE(review): presumably the median -- confirm in sanityCheck
    ####### high vScore versions started
    high_CQ_dict = utility.getHighVScoreVersions_CQ( NonZero_sanitizedVersionsWithScore , sanitizedVersions_CQ , threshold)
    high_vScore_Dict = utility.getHighVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold)
    print "non zero high_vscore_versions ", len(high_vScore_Dict)
    ####### high vScore versions ended
    ####### low vScore versions started
    low_CQ_dict = utility.getLowVScoreVersions_CQ( NonZero_sanitizedVersionsWithScore , sanitizedVersions_CQ , threshold)
    low_vScore_Dict = utility.getLowVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold)
    print "non zero len_vscore_versions ", len(low_vScore_Dict)
    ####### low vScore versions ended
    ##### dumpin time
    ### three ways: first by dumping all highs then all lows
    themegaFile_Seperated = outputStrParam + "_" + "non_zero_all-CQ-HL-Seperated.csv"
    IO_.dumpIntoFileByHighAndLow( themegaFile_Seperated, high_CQ_dict, low_CQ_dict )
    ### three ways : second by dumping as it si
    themegaFile_All = outputStrParam + "_" + "non_zero_all-CQ-HL.csv"
    IO_.dumpIntoFile( themegaFile_All,sanitizedVersions_CQ , NonZero_sanitizedVersionsWithScore, threshold, False )
    LGR.performLogiRegression(themegaFile_All)
def mobilesoft_cart(fileNameParam, fileToWriteP): indexVector = [0, 5, 10, 12, 13, 18, 19, 20] testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) trainData = testAndTrainData[0] testData = testAndTrainData[1] selected_training_data = createMobileSoftFeatures(trainData, indexVector) print "Size of selected training data : ", np.shape(selected_training_data) print "="*50 print "Glimpse at selected features (10th entry): \n", selected_training_data.iloc[9, :] print "="*50 print "Glimpse at labels (10th entry): \n", testData.iloc[9] print "="*50 dict_of_results = param_exp_classifier.runCART(selected_training_data, testData, 0.90) reportStr = param_exp_analysis.analyzeThis(dict_of_results) IO_.writeStrToFile(fileToWriteP, reportStr)
def experiemnt_mobilesoft(dbFileName, outputStrParam):
    # MobileSoft experiment: agglomerative clustering (5 clusters) over ALL
    # vulnerability scores, silhouette evaluation, then a CSV dump of the
    # clusterified code-quality rows.
    from sklearn import cluster
    import plotter
    clusteringType = cluster.AgglomerativeClustering(n_clusters=5)
    print "Performing experiemnt # Mobilesoft"
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getMobilesoftCodeQualityVersions(versionAndCodeQualityDict, 1.00)
    sanitizedVersions_CQ = sanitizedVersions
    # NOTE(review): despite the "NonZero" name this calls getAllVulnerbaility...
    # so zero scores are included here -- confirm that is intended.
    NonZero_sanitizedVersionsWithScore = sanityCheck.getAllVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #    strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n"
    # # ##
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)
    # sklearn clustering needs a 2-D (n_samples, 1) array
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    print "No centroids for Aggolomerative clustering"
    NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    print "And the labels are .... "
    print len(labelsFroVersions)
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels)
    print "Silhouette average---> ", silhouette_avg
    # clusteringType = cluster.KMeans(n_clusters=5)
    # clusteringType.fit(reshapedNonZerSanitizedScores)
    # centroids = clusteringType.cluster_centers_
    # print "And the centroids are .... ", centroids
    ##############################
    themegaFile_All = outputStrParam + "_" + "cluster_Headered_1407.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels)
# NOTE(review): the triple-quote below is unbalanced in this chunk; it appears
# to open a commented-out region that continues past this view -- confirm.
'''
def experiemnt_gaussian_naive_bayes(fileNameParam): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) print "This is 'experiemnt_gaussian_naive_bayes' " # settign up train data trainData = testAndTrainData[0] original_rows = trainData.shape[0] original_cols = trainData.shape[1] print "Size of training data : rows: {}, columns: {}".format( original_rows, original_cols) # settign up test data testData = testAndTrainData[1] for selCount in xrange(original_cols): count_ = selCount + 1 if count_ <= original_cols: slected_training_data = giveSelectedTrainingData( trainData, testData, count_) print "################# No. of features to work with={} ############".format( count_) print "Size of selected training data : ", slected_training_data.shape emperiemntSplitters = [ float(x) / float(10) for x in xrange(10) if x > 0 ] for elem in emperiemntSplitters: #print "Training size: {} %".format(float(elem*100)) exp_x_classifiers.runGNB(slected_training_data, testData, elem)
def getData(fileNameParam):
    """Split the named file via IO_ and return the (train, test) pair."""
    split_pair = IO_.giveTestAndTrainingData(fileNameParam)
    return split_pair[0], split_pair[1]
def experiemnt_random_forest(fileNameParam, fileToWriteP): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) print "This is 'experiemnt_random_forest' " # settign up train data trainData = testAndTrainData[0] original_rows = trainData.shape[0] original_cols = trainData.shape[1] print "Size of training data : rows: {}, columns: {}".format( original_rows , original_cols ) # settign up test data testData = testAndTrainData[1] dict_of_results = param_exp_classifier.runRandomForest(trainData, testData) reportStr = param_exp_analysis.analyzeThis(dict_of_results) IO_.writeStrToFile(fileToWriteP, reportStr)
def experiemnt_CART(fileNameParam):
    # CART experiment: use ALL available features (the per-count sweep is
    # commented out) and run CART over training splits 0.1 .. 0.9.
    import exp_x_classifiers, IO_
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    print "This is 'experiemnt_CART' "
    # settign up train data
    trainData = testAndTrainData[0]
    original_rows = trainData.shape[0]
    original_cols = trainData.shape[1]
    print "Size of training data : rows: {}, columns: {}".format(original_rows, original_cols)
    # settign up test data
    testData = testAndTrainData[1]
    # for selCount in xrange(original_cols):
    #     count_ = selCount + 1
    #     if count_ < original_cols:
    slected_training_data = giveSelectedTrainingData(trainData, testData, original_cols)
    print "################# No. of features to work with={} ############".format(original_cols)
    print "Size of selected training data : ", slected_training_data.shape
    emperiemntSplitters = [float(x) / float(10) for x in xrange(10) if x > 0]
    for elem in emperiemntSplitters:
        #print "Training size: {} %".format(float(elem*100))
        # NOTE(review): imports exp_x_classifiers above but calls
        # param_exp_classifier.runCART -- relies on a module-level import; confirm.
        param_exp_classifier.runCART(slected_training_data, testData, elem)
def experiemnt_random_forest(fileNameParam):
    # Sweep the number of selected features (1..all columns) and, for each,
    # the training-split fraction (0.1 .. 0.9), running a random forest per combo.
    import exp_x_classifiers , IO_
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    #print testAndTrainData
    print "This is 'experiemnt_random_forest' "
    # settign up train data
    trainData = testAndTrainData[0]
    #print trainData
    original_rows = trainData.shape[0]
    original_cols = trainData.shape[1]
    print "Size of training data : rows: {}, columns: {}".format( original_rows , original_cols )
    # settign up test data
    testData = testAndTrainData[1]
    #print testData
    for selCount in xrange(original_cols):
        count_ = selCount + 1
        if count_ <= original_cols:  # always true for this range; kept from original
            slected_training_data = giveSelectedTrainingData(trainData, testData, count_ )
            print "################# No. of features to work with={} ############".format(count_)
            print "Size of selected training data : ", slected_training_data.shape
            emperiemntSplitters=[float(x)/float(10) for x in xrange(10) if x > 0]
            for elem in emperiemntSplitters:
                #print "Training size: {} %".format(float(elem*100))
                exp_x_classifiers.runRandomForest(slected_training_data, testData, elem)
def experiemnt_random_forest(fileNameParam, fileToWriteP): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) print "This is 'experiemnt_random_forest' " # settign up train data trainData = testAndTrainData[0] original_rows = trainData.shape[0] original_cols = trainData.shape[1] print "Size of training data : rows: {}, columns: {}".format( original_rows, original_cols) # settign up test data testData = testAndTrainData[1] dict_of_results = param_exp_classifier.runRandomForest(trainData, testData) reportStr = param_exp_analysis.analyzeThis(dict_of_results) IO_.writeStrToFile(fileToWriteP, reportStr)
def experiment_mobilesoft_knn(fileNameParam): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) trainData = testAndTrainData[0] testData = testAndTrainData[1] #print trainData selected_training_data = getPCAedFeatures(trainData) print "Size of selected training data : ", np.shape(selected_training_data) print "=" * 50 exp_x_classifiers.runKNN(selected_training_data, testData, 0.90) print "=" * 50
def runRandomForest(trainDataParam, testDataParam):
    # Grid-search a reduced RandomForest hyper-parameter space with 10-fold
    # cross validation; returns {param-combo-key-string: result tuple}.
    res_combo_dict ={}
    #n_estimators_list=[500]
    n_estimators_list = [75, 80, 85]
    criterion_list = ['gini', 'entropy']
    #max_features_list = ['auto', 'sqrt', 'log2', None]
    max_depth_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, None]
    max_leaf_nodes_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, None]
    bootstrap_list = [True, False]
    #min_samples_split_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
    #oob_score_list = [True, False]
    min_weight_fraction_leaf_list = [0.1, 0.2, 0.3, 0.4, 0.5] # cannot be more than 0.50
    ### setting the aprameters : test purpose
    # n_estimators_list=[50, 50000]
    # criterion_list = ['gini', 'entropy']
    # max_features_list=['auto', None]
    # max_depth_list = [1, 1000 ]
    # max_leaf_nodes_list = [None, 5, 1000] # in our datset only 549 legit samples so should eb limited to 549
    # bootstrap_list=[True, False]
    # min_samples_split_list = [1, 1000] # in our datset only 549 legit samples so should eb limited to 549
    # oob_score_list=[True, False]
    # min_weight_fraction_leaf_list=[0.0, 0.5] # must be between 0.0 and 0.50
    # warm_start_list=[True, False]
    ###
    for eti in n_estimators_list:
        for crit in criterion_list:
            for max_depth_ in max_depth_list:
                for max_leaf in max_leaf_nodes_list:
                    for bootstrap_ in bootstrap_list:
                        for mwfratleaf in min_weight_fraction_leaf_list:
                            ## display params:
                            # n_jobs has been set to -1 to use all the cores avialable , not part fo an experiemnt
                            print "##########"
                            print "n_estimators={}, criterion={}, max_dept={}, max_leaf_nodes={}".format(eti, crit, max_depth_, max_leaf )
                            print "bootstrap={}, min-wt-frac={}".format(bootstrap_, mwfratleaf )
                            # key encodes the full parameter combo for the results dict
                            key_str_1 = str(eti) + "_" + crit + "_" + str(max_depth_) + "_" + str(max_leaf) + "_"
                            key_str_2 = str(bootstrap_) + "_" + str(mwfratleaf) + "_"
                            key_for_dict = key_str_1 + key_str_2
                            ## fire up the model
                            # IO_.duration() times the construction + cross-validation
                            # NOTE(review): the with-block extent is inferred from the
                            # collapsed source layout -- confirm against the original.
                            with IO_.duration():
                                theRndForestModel = RandomForestClassifier( n_estimators=eti, criterion=crit, max_depth=max_depth_, min_weight_fraction_leaf=mwfratleaf, max_leaf_nodes=max_leaf, bootstrap=bootstrap_ )
                                res_tuple = perform_cross_validation(theRndForestModel, trainDataParam, testDataParam, 10)
                            res_combo_dict[key_for_dict] = res_tuple
                            print "##########"
    return res_combo_dict
def createcombinatorialFiles(dirParam, doc_count_param, dir_to_write_Param): all_file_names = os.listdir(dirParam) valid_file_names = [x for x in all_file_names if x.endswith("dump")] for elem1 in valid_file_names: elem1_fileName= dirParam + "/" + elem1 elem1_tokns = IO_.readFile(elem1_fileName) #print "Count of tokens for {} is {}".format(elem1, len(elem1_tokns)) for elem2 in valid_file_names: elem2_fileName= dirParam + "/" + elem2 elem2_tokns = IO_.readFile(elem2_fileName) file_name_to_write = elem1 + "_" + elem2 + ".tsv" print "Cmparing {} and {}".format(elem1, elem2) print "Cmparing {} and {}".format(len(elem1_tokns), len(elem2_tokns)) bothfiletokens = pre_process_tokens(elem1_tokns, elem2_tokns, doc_count_param) filtered_elem1_tokns = bothfiletokens[0] filtered_elem2_tokns = bothfiletokens[1] IO_.writeTokensToFile(dir_to_write_Param, file_name_to_write, filtered_elem1_tokns, filtered_elem2_tokns)
def experiment_mobilesoft_knn(fileNameParam, indexVector): testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam) trainData = testAndTrainData[0] testData = testAndTrainData[1] #print trainData selected_training_data = createMobileSoftFeatures(trainData, indexVector) print "Size of selected training data : ", np.shape(selected_training_data) print "=" * 50 print "Glimpse at selected features (10th entry): \n", selected_training_data.iloc[ 9, :] print "=" * 50 print "Glimpse at labels (10th entry): \n", testData.iloc[9] print "=" * 50 exp_x_classifiers.runKNN(selected_training_data, testData, 0.90) print "=" * 50
def runSVM(fileNamaParam, trainizingSizeParam):
    """Train an RBF-kernel SVM on a train/test split of the file's data and
    evaluate its predictions on the held-out portion."""
    # fraction of the data held out for testing
    holdout_fraction = 1.0 - trainizingSizeParam
    data_pair = IO_.giveTestAndTrainingData(fileNamaParam)
    features_all = data_pair[0]
    labels_all = data_pair[1]
    ## split into train / test partitions (fixed seed keeps the split repeatable)
    featureSpace_train, featureSpace_test, vScore_train, vScore_test = cross_validation.train_test_split(
        features_all, labels_all, test_size=holdout_fraction, random_state=0)
    ## fit the classifier and score the held-out samples
    fitted_svm = svm.SVC(kernel='rbf', C=1).fit(featureSpace_train, vScore_train)
    predictions = fitted_svm.predict(featureSpace_test)
    evalClassifier(vScore_test, predictions)
def runCART(fileNamaParam, trainizingSizeParam):
    """Train a CART (decision tree) classifier on a train/test split of the
    file's data and evaluate its predictions on the held-out portion."""
    # fraction of the data held out for testing
    holdout_fraction = 1.0 - trainizingSizeParam
    data_pair = IO_.giveTestAndTrainingData(fileNamaParam)
    features_all = data_pair[0]
    labels_all = data_pair[1]
    ## split into train / test partitions (fixed seed keeps the split repeatable)
    featureSpace_train, featureSpace_test, vScore_train, vScore_test = cross_validation.train_test_split(
        features_all, labels_all, test_size=holdout_fraction, random_state=0)
    tree_model = DecisionTreeClassifier()
    tree_model.fit(featureSpace_train, vScore_train)
    predictions = tree_model.predict(featureSpace_test)
    evalClassifier(vScore_test, predictions)
max_leaf_nodes=10000) mae_for_param_combo_2 = perform_cross_validation( the_Model_2, trainingData, testData, cv_param)[1] t2 = time.time() time_for_param_comb_2 = t2 - t1 mae_list_2.append(mae_for_param_combo_2) time_list_2.append(time_for_param_comb_2) mae_a12_ = a12_utility.doSlowA12(mae_list_1, mae_list_2) time_a12_ = a12_utility.doSlowA12(time_list_2, time_list_1) print "MAE comaprison: is default worse than 'best combo' ?", mae_a12_ print "time comaprison: is 'best' combo slower than default ?", time_a12_ datasetFileName = "13_NonZeroDataset_Aggolo.csv" iterations = 10000 cv_param = 5 print "========== Random Forest ==========" with IO_.duration(): runRFTest(datasetFileName, iterations, cv_param) print "========== KNN ==========" with IO_.duration(): runknnTest(datasetFileName, iterations, cv_param) print "========== SVM ==========" with IO_.duration(): runsvmTest(datasetFileName, iterations, cv_param) print "========== CART ==========" with IO_.duration(): runCARTTest(datasetFileName, iterations, cv_param)
def giveSelectedTrainingData(trainParam, testParam, no_of_chices_param):
    # Select the k best features by chi-squared score from the training frame,
    # using testParam as the target labels for scoring.
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    train_data_new = SelectKBest(chi2, k=no_of_chices_param).fit_transform(trainParam, testParam)
    return train_data_new


####### Open loggger ####
# Redirect stdout to a log file for the whole run, execute the random-forest
# parameter experiment, then restore stdout and close the log.
old_stdout = sys.stdout
output_file_name = "param_exp_random_forest_500_two_folds.txt"
log_file = open(output_file_name, "w")
sys.stdout = log_file
print "Started at: ", IO_.giveTimeStamp()
fileNameParam = "13_NonZeroDataset_Aggolo.csv"
fileToWrite = "param_exp_combo_report_500_two_folds.csv"
experiemnt_random_forest(fileNameParam, fileToWrite)
#experiemnt_SVM(fileNameParam)
#experiemnt_KNN(fileNameParam)
#experiemnt_CART(fileNameParam)
print "Done ;-)"
print "Ended at: ", IO_.giveTimeStamp()
#### close logger
sys.stdout = old_stdout
log_file.close()
for value_for_one_classifier in valueListParam: comparer = value_for_one_classifier comparees = [x for x in valueListParam if x!=value_for_one_classifier] print "---" for comparee_item in comparees: #print "comparer: {}, comapree: {}".format(comparer, comparee_item) a12_results = a12_utility.doSlowA12(comparer, comparee_item) print "----->", a12_results ####### Open loggger #### old_stdout = sys.stdout output_file_name="a12_res_2Clusters.txt" log_file = open( output_file_name, "w") sys.stdout = log_file print "Started at: ", IO_.giveTimeStamp() count=10000 file_="2Clusters_NonZeroDataset_Aggolo.csv" all_accu_moea = runs(count, file_) all_acuu = all_accu_moea[0] all_moea = all_accu_moea[1] #print "**************** Hypo. tests for Accuracy ****************" #stat_hypo_test_(all_acuu) #print "**************** Hypo. tests for Mean Abs. Error ****************" #stat_hypo_test_(all_moea) print "**************** A12 tests for Accuracy ****************" stat_a12_test_(all_acuu) print "**************** A12 tests for Mean Abs. Error ****************" stat_a12_test_(all_moea) print "Ended at: ", IO_.giveTimeStamp()
mae_for_param_combo_2 = perform_cross_validation(the_Model_2, trainingData, testData, cv_param)[1] t2 = time.time() time_for_param_comb_2 = t2 - t1 mae_list_2.append(mae_for_param_combo_2) time_list_2.append(time_for_param_comb_2) mae_a12_ = a12_utility.doSlowA12(mae_list_1, mae_list_2) time_a12_ = a12_utility.doSlowA12(time_list_2, time_list_1) print "MAE comaprison: is default worse than 'best combo' ?", mae_a12_ print "time comaprison: is 'best' combo slower than default ?", time_a12_ datasetFileName="13_NonZeroDataset_Aggolo.csv" iterations=10000 cv_param = 5 print "========== Random Forest ==========" with IO_.duration(): runRFTest(datasetFileName, iterations, cv_param) print "========== KNN ==========" with IO_.duration(): runknnTest(datasetFileName, iterations, cv_param) print "========== SVM ==========" with IO_.duration(): runsvmTest(datasetFileName, iterations, cv_param) print "========== CART ==========" with IO_.duration(): runCARTTest(datasetFileName, iterations, cv_param)
def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag): from sklearn import cluster import plotter clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=13) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=13) print "Performing experiemnt # 3: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions #print "Sanitized versions that will be used in study ", len(sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions) #print "zzzz", len(NonZero_sanitizedVersionsWithScore) ### dyumping scores ... brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] #print "lalalaa ", onlyTheNonZeroSanitizedVScores #strOfScoresToDump="" #for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" ### #IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... 
", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) ##### plotting clusters start #low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore) #low_cluster_x = [ 22.35294118 for x in low_cluster_y] #hig_cluster_x = [ 50.82030058 for x in high_cluster_y] #plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y) ##### plottign clusters end else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg ############################## themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
def runRandomForest(trainDataParam, testDataParam): res_combo_dict ={} # ### setting the aprameters n_estimators_list=[500] #n_estimators_list=[10, 50, 100, 500] criterion_list = ['gini', 'entropy'] max_features_list=['auto', 'sqrt', 'log2', None] max_depth_list = [5, 15, 50, None ] max_leaf_nodes_list = [None, 25, 50, 75] # in our datset only 549 legit samples so should eb limited to 549 bootstrap_list=[True, False] min_samples_split_list = [1, 25, 50, 100] # in our datset only 549 legit samples so should eb limited to 549 oob_score_list=[True, False] min_weight_fraction_leaf_list=[0.0, 0.2, 0.3, 0.4] # must be between 0.0 and 0.50 warm_start_list=[True, False] # ### ### setting the aprameters : test purpose # n_estimators_list=[50, 50000] # criterion_list = ['gini', 'entropy'] # max_features_list=['auto', None] # max_depth_list = [1, 1000 ] # max_leaf_nodes_list = [None, 5, 1000] # in our datset only 549 legit samples so should eb limited to 549 # bootstrap_list=[True, False] # min_samples_split_list = [1, 1000] # in our datset only 549 legit samples so should eb limited to 549 # oob_score_list=[True, False] # min_weight_fraction_leaf_list=[0.0, 0.5] # must be between 0.0 and 0.50 # warm_start_list=[True, False] ### for eti in n_estimators_list: for crit in criterion_list: for maxfeat in max_features_list: for max_depth_ in max_depth_list: for max_leaf in max_leaf_nodes_list: for bootstrap_ in bootstrap_list: for min_sample in min_samples_split_list: if bootstrap_==False: oob_score_list=[False, False] for oob_ in oob_score_list: for mwfratleaf in min_weight_fraction_leaf_list: for warm_start_ in warm_start_list: ## display params: # n_jobs has been set to -1 to use all the cores avialable , not part fo an experiemnt print "##########" print "n_estimators={}, criterion={}, max_features={}, max_dept={}, max_leaf_nodes={}".format(eti, crit, maxfeat, max_depth_, max_leaf ) print "bootstrap={}, min-sample-split={}, oob_score={}, min-wt-frac={}, 
warm-start={}".format(bootstrap_, min_sample, oob_, mwfratleaf, warm_start_ ) key_str_1 = str(eti) + "_" + crit + "_" + str(maxfeat) + "_" + str(max_depth_) + "_" + str(max_leaf) + "_" key_str_2 = str(bootstrap_) + "_" + str(min_sample) + "_" + str(oob_) + "_" + str(mwfratleaf) + "_" +str(warm_start_) key_for_dict = key_str_1 + key_str_2 ## fire up the model with IO_.duration(): theRndForestModel = RandomForestClassifier( n_estimators=eti, criterion=crit, max_depth=max_depth_, min_samples_split=min_sample, max_features=maxfeat, min_weight_fraction_leaf=mwfratleaf, max_leaf_nodes=max_leaf, bootstrap=bootstrap_, oob_score=oob_, n_jobs=-1 , warm_start=warm_start_ ) res_tuple = perform_cross_validation(theRndForestModel, trainDataParam, testDataParam, 2) res_combo_dict[key_for_dict] = res_tuple print "##########" return res_combo_dict