def testHypothesis(metric):
    '''
        Run a pairwise statistical test between every pair of classifiers,
        once for the 'MINIMAL' data and once for the 'EQUIVALENT' data.

        The two data sets come from getDataByFile() applied to the
        Summary_BestClassifiers_All30Runs.csv file under the current working
        directory. For each unordered pair of classifiers, the rows of each
        classifier are selected with DataFrame-style `query` and the `metric`
        column of both selections is handed to:
          - runWilcoxon    when either classifier is 'GNB';
          - runTTestPaired otherwise.
        0.05 is passed as the third argument of both helpers (presumably the
        significance level - confirm against their definitions).

        metric: name of the column compared between the two classifiers.

        Nothing is returned; results are whatever the helpers print.
    '''
    # Local import keeps this block self-contained.
    from itertools import combinations

    # NOTE(review): 'F1' is hard-coded here while `metric` selects the column
    # compared below - confirm whether getDataByFile should receive `metric`.
    minimalData, equivalentData = getDataByFile(
        '{}/ML/Results/Summary/Summary_BestClassifiers_All30Runs.csv'.format(os.getcwd()),
        'F1')

    # ========================================
    # ===== Hypothesis Testing ===============
    # ========================================
    for model, data in [['MINIMAL', minimalData], ['EQUIVALENT', equivalentData]]:
        banner = (len(model) + 20 + 2) * '='
        print(banner)
        print('===== {} ==============='.format(model))
        print(banner)

        # Work on a private copy: the pairs are enumerated without mutating
        # whatever list getPossibleClassifiers() hands back.
        classifiers = list(getPossibleClassifiers())

        # combinations() yields each unordered pair exactly once, in list order.
        for classifierA, classifierB in combinations(classifiers, 2):
            print('{} x {}'.format(classifierA, classifierB))

            dataA = data.query('Classifier == \'{}\''.format(classifierA))
            dataB = data.query('Classifier == \'{}\''.format(classifierB))

            if classifierA == 'GNB' or classifierB == 'GNB':
                runWilcoxon(dataA[metric], dataB[metric], 0.05)
            else:
                runTTestPaired(dataA[metric], dataB[metric], 0.05)

            print('')

        print('')
def setUp():
    '''
        Build the three option lists used by the experiment entry points.

        Returns a tuple (target columns, classifiers, program names) where the
        target columns are the fixed pair 'MINIMAL'/'EQUIVALENT', the
        classifiers come from getPossibleClassifiers() and each program name is
        util.getPathName() applied to an entry of util.getPrograms().
    '''
    columns = ['MINIMAL', 'EQUIVALENT']
    classifiers = getPossibleClassifiers()

    programNames = []
    for programPath in util.getPrograms():
        programNames.append(util.getPathName(programPath))

    return columns, classifiers, programNames
def testDataDistribution(metric):
    '''
        Run runShapiroTest on the `metric` column of every classifier, first
        for the 'MINIMAL' data and then for the 'EQUIVALENT' data.

        Both data sets are obtained from getDataByFile() applied to the
        Summary_BestClassifiers_All30Runs.csv file under the current working
        directory. 0.05 is forwarded as the second argument of runShapiroTest.

        metric: name of the column handed to the test.
    '''
    summaryFile = '{}/ML/Results/Summary/Summary_BestClassifiers_All30Runs.csv'.format(
        os.getcwd())
    minimalData, equivalentData = getDataByFile(summaryFile, 'F1')

    # ============================================
    # ===== Test Data distribution ===============
    # ============================================
    labelledData = (('MINIMAL', minimalData), ('EQUIVALENT', equivalentData))
    for model, data in labelledData:
        for classifier in getPossibleClassifiers():
            print('{} - {}'.format(model, classifier))

            # Keep only the rows that belong to the current classifier
            selection = 'Classifier == \'{}\''.format(classifier)
            classifierRows = data.query(selection)
            runShapiroTest(classifierRows[metric], 0.05)

            print('')

        print('')
def executeAll(targetColumns, classifiers, specifiedProgram=None, executeWithBestParameter=False):
    '''
        Function used to execute all classifiers in all columns to be sorted

        targetColumns: iterable with the target columns to run.
        classifiers: iterable with the classifiers to run for every column.
            When None, every classifier from getPossibleClassifiers() is used.
        specifiedProgram: forwarded unchanged to crossValidation().
        executeWithBestParameter: when True, bestParameter(column, classifier)
            is looked up and forwarded as `parameter`; otherwise None is sent.
    '''
    # Honour the caller's selection; previously this argument was ignored and
    # the full classifier list was always used.
    if classifiers is None:
        classifiers = getPossibleClassifiers()

    # Materialise once so a one-shot iterable survives the nested loop
    classifiers = list(classifiers)

    for column in targetColumns:
        for classifier in classifiers:
            print('Classifier: {} | Column: {}'.format(classifier, column))

            parameter = bestParameter(column, classifier) if executeWithBestParameter else None
            crossValidation(column, classifier, specifiedProgram, parameter=parameter)
def experiment():
    '''
        Collect, for every (target column, classifier) pair, the value that
        getMaxF1() returns for the original results file and for each
        ' - gbs_[...]' results file (one per dropable column), print them and
        write the whole table to ML/Results/gbs.csv through util.writeInCsvFile.

        Target columns, classifiers and dropable columns are processed in
        sorted order. Each output row is
        [targetColumn, classifier, [originalScore, scoreColumn1, ...]].
    '''
    baseFolder = '{}/ML/Results'.format(os.getcwd())

    targetColumns = sorted(getPossibleTargetColumns())
    classifiers = sorted(getPossibleClassifiers())
    dropableColumns = sorted(getDropableColumns())

    summaryRows = []
    for targetColumn in targetColumns:
        for classifier in classifiers:
            scores = []

            # Score of the run that kept every column
            baselinePath = '{}/{}/{}.csv'.format(baseFolder, targetColumn, classifier)
            baselineScore = float(getMaxF1(baselinePath))
            scores.append(baselineScore)
            print('{} - {} - Original: {:.2f}'.format(targetColumn, classifier, baselineScore))

            # Score of each run that dropped a single column
            for column in dropableColumns:
                droppedPath = '{}/{}/{} - gbs_[\'{}\'].csv'.format(
                    baseFolder, targetColumn, classifier, column)
                droppedScore = float(getMaxF1(droppedPath))
                scores.append(droppedScore)
                print('{} - {} - {}: {:.2f}'.format(targetColumn, classifier, column, droppedScore))

            summaryRows.append([targetColumn, classifier, scores])

    util.writeInCsvFile('{}/ML/Results/gbs.csv'.format(os.getcwd()), summaryRows)
def classify_main(arguments):
    '''
        Function responsible for receiving a mutant dataset and classifying those mutants as minimal, equivalent or traditional.

        arguments: command line style list; index 0 is skipped. Recognised flags:
            --column <name>      target column to classify
            --allColumns         use every column from getPossibleTargetColumns()
            --program <name>     program to classify
            --allPrograms        classify every program found in ./Programs
            --classifier <name>  classifier to use
            --bestClassifier     use analyzes.getBestClassifierForPrograms() per program/column
            --allClassifiers     use every classifier from getPossibleClassifiers()
            --parameter <int>    parsed but currently not used (see note below)
            --allParameters      use getPossibleParameters() for each classifier

        When a required option is missing, the usage messages are printed and
        the function returns without classifying anything.
    '''
    # Possible parameters
    possibleTargetColumns = getPossibleTargetColumns()
    possibleClassifiers = getPossibleClassifiers()
    possiblePrograms = [
        util.getPathName(program)
        for program in util.getPrograms('{}/Programs'.format(os.getcwd()))
    ]

    # Parameters
    targetColumn = None
    allTargetColumns = False
    programToClassify = None
    classifier = None
    executeAllPrograms = False
    executeBestClassifierForProgram = False
    programsBestClassifiers = None
    executeAllClassifiers = False
    executeAllParameters = False

    # Walk through all parameters
    for iCount in range(1, len(arguments)):
        arg = arguments[iCount]

        # A flag given as the very last argument has no value; use None so the
        # validation below reports it instead of failing with an IndexError.
        nextValue = arguments[iCount + 1] if iCount + 1 < len(arguments) else None

        if arg == '--column':
            targetColumn = nextValue
        elif arg == '--allColumns':
            allTargetColumns = True
        elif arg == '--program':
            programToClassify = nextValue
        elif arg == '--allPrograms':
            executeAllPrograms = True
        elif arg == '--classifier':
            classifier = nextValue
        elif arg == '--bestClassifier':
            executeBestClassifierForProgram = True
            programsBestClassifiers = analyzes.getBestClassifierForPrograms()
        elif arg == '--allClassifiers':
            executeAllClassifiers = True
        elif arg == '--parameter':
            # NOTE(review): the value is validated as an integer but never
            # consumed below - the parameters actually used are the defaults in
            # `defaultParameters` or getPossibleParameters(). Confirm intent.
            if nextValue is not None:
                int(nextValue)
        elif arg == '--allParameters':
            executeAllParameters = True

    withoutProgramMessage = 'Please specify the program correctly. The {program} could be ' + str(
        possiblePrograms)
    withoutColumnMessage = 'Please specify the target column throught --column {targetColumn}. The {targetColumn} could be ' + str(
        possibleTargetColumns)
    withoutClassifierMessage = 'Please specify the classifier to be used throught --classifier {classifier}. The {classifier} could be ' + str(
        possibleClassifiers)

    errorMessage = ''
    if (targetColumn is None or targetColumn not in possibleTargetColumns) and not allTargetColumns:
        errorMessage = '{}{}\n'.format(errorMessage, withoutColumnMessage)

    if programToClassify is None and not executeAllPrograms:
        errorMessage = '{}{}\n'.format(errorMessage, withoutProgramMessage)

    if classifier is None and not executeBestClassifierForProgram and not executeAllClassifiers:
        errorMessage = '{}{}\n'.format(errorMessage, withoutClassifierMessage)

    if len(errorMessage) > 0:
        print(errorMessage)
        return

    if executeAllPrograms:
        programsToBeClassified = possiblePrograms.copy()
    else:
        programsToBeClassified = [programToClassify]

    if allTargetColumns:
        targetColumns = possibleTargetColumns.copy()
    else:
        targetColumns = [targetColumn]

    # Classifiers that take no parameter: a single '' placeholder is used
    classifiersWithoutParameter = ('SVM', 'LDA', 'LR', 'GNB')

    # Hard-coded parameter per (classifier, column) used without --allParameters
    defaultParameters = {
        ('KNN', 'MINIMAL'): [1],
        ('KNN', 'EQUIVALENT'): [11],
        ('DT', 'MINIMAL'): [15],
        ('DT', 'EQUIVALENT'): [35],
        ('RF', 'MINIMAL'): [5],
        ('RF', 'EQUIVALENT'): [15],
    }

    for column in targetColumns:
        for program in programsToBeClassified:
            if executeBestClassifierForProgram:
                classifier, _ = programsBestClassifiers['{}_{}'.format(program, column)]

            if executeAllClassifiers:
                classifiers = possibleClassifiers
            else:
                classifiers = [classifier]

            for _classifier in classifiers:
                if executeAllParameters:
                    parameters = getPossibleParameters(_classifier)
                elif _classifier in classifiersWithoutParameter:
                    parameters = ['']
                else:
                    parameters = defaultParameters.get((_classifier, column))
                    if parameters is None:
                        # Previously this left `parameters` unbound (or stale
                        # from an earlier iteration); report and skip instead.
                        print('No default parameter for classifier {} and column {}. Skipping.'
                              .format(_classifier, column))
                        continue

                for parameter in parameters:
                    # Result file suffix: classifier and parameter are only
                    # appended when several of them are being executed.
                    complementClassifierName = '_{}'.format(
                        _classifier) if executeAllClassifiers else ''
                    complementClassifierName = '{baseName}{parameter}'.format(
                        baseName=complementClassifierName,
                        parameter='_{}'.format(parameter) if executeAllParameters else '')

                    dataSetFileName = '{}/ML/Dataset/{}/Programs/{}.csv'.format(
                        os.getcwd(), column, program)
                    resultDataSetFileName = '{baseFolder}/ML/Results/{targetColumn}/Classification/{programName}{complement}.csv'.format(
                        baseFolder=os.getcwd(),
                        targetColumn=column,
                        programName=program,
                        complement=complementClassifierName)

                    print(
                        '\nProgram: {} | Column: {} | Classifier: {} | Parameter: {}'
                        .format(program, column, _classifier, parameter))

                    # NOTE(review): the '' placeholder is skipped here, so the
                    # classifiers without parameter are only announced, never
                    # classified, unless --allParameters supplies real values.
                    if parameter != '':
                        classify(dataSetFileName, resultDataSetFileName, column,
                                 _classifier, parameter, program)
def debug_main(arguments):
    '''
        Main function performed at the time of running the experiment

        arguments: command line style list; index 0 is skipped. Recognised flags:
            --column <name>      target column
            --classifier <name>  classifier
            --program <name>     restrict the run to one program
            --pbp                run crossValidation once per known program
            --best               use bestParameter(column, classifier)
            --all                (must be arguments[1]) delegate to executeAll()
            --allPbP             (must be arguments[1]) delegate to executeAllEachProgram()

        When the column, classifier or program is missing/invalid, the usage
        messages are printed and nothing is executed.
    '''
    # Possible parameters
    possibleTargetColumns = getPossibleTargetColumns()
    possibleClassifiers = getPossibleClassifiers()
    possiblePrograms = [
        util.getPathName(program)
        for program in util.getPrograms('{}/Programs'.format(os.getcwd()))
    ]

    # Parameters
    targetColumn = None
    classifier = None
    columnsToDrop = []
    columnsToAdd = []
    program = None
    programByProgram = False
    executeWithBestParameter = False

    # Walk through all parameters
    for iCount in range(1, len(arguments)):
        arg = arguments[iCount]

        # A flag given as the very last argument has no value; use None so the
        # validation below reports it instead of failing with an IndexError.
        nextValue = arguments[iCount + 1] if iCount + 1 < len(arguments) else None

        if arg == '--column':
            targetColumn = nextValue
        elif arg == '--classifier':
            classifier = nextValue
        elif arg == '--program':
            program = nextValue
        elif arg == '--pbp':
            programByProgram = True
        elif arg == '--best':
            executeWithBestParameter = True

    if len(arguments) > 1:
        if arguments[1] == '--all':
            # Execute all classifiers with all classifications. executeAll()
            # resolves the best parameter itself for every column/classifier,
            # so nothing is passed in its `specifiedProgram` slot.
            executeAll(possibleTargetColumns,
                       possibleClassifiers,
                       executeWithBestParameter=executeWithBestParameter)
            return
        elif arguments[1] == '--allPbP':
            # Execute all, but program by program
            executeAllEachProgram(possibleTargetColumns, possibleClassifiers,
                                  possiblePrograms, executeWithBestParameter)
            return

    withoutColumnMessage = 'Please specify the target column throught --column {targetColumn}. The {targetColumn} could be ' + str(
        possibleTargetColumns)
    withoutClassifierMessage = 'Please specify the classifier throught --classifier {classifier}. The {classifier} could be ' + str(
        possibleClassifiers)
    withoutProgramMessage = 'Please specify the program correctly. The {program} could be ' + str(
        possiblePrograms)

    errorMessage = ''
    if targetColumn is None or targetColumn not in possibleTargetColumns:
        errorMessage = '{}{}\n'.format(errorMessage, withoutColumnMessage)

    if classifier is None:
        errorMessage = '{}{}\n'.format(errorMessage, withoutClassifierMessage)

    if program is not None and program not in possiblePrograms:
        errorMessage = '{}{}\n'.format(errorMessage, withoutProgramMessage)

    if len(errorMessage) > 0:
        print(errorMessage)
        return

    # Set the best parameter if it is necessary. Done only after validation so
    # bestParameter() is never called with a missing column or classifier.
    parameter = bestParameter(targetColumn, classifier) if executeWithBestParameter else None

    # Execute cross validation
    if not programByProgram:
        crossValidation(targetColumn, classifier, program, columnsToDrop,
                        columnsToAdd, parameter=parameter)
    else:
        for specifiedProgram in possiblePrograms:
            crossValidation(targetColumn, classifier, specifiedProgram,
                            columnsToDrop, columnsToAdd, parameter=parameter)
def crossValidation(targetColumn, classifier, specifiedProgram=None, columnsToDrop=None, columnsToAdd=None, printResults=False, parameter=None):
    '''
        Prepare the data set, the column names and the results file name for
        one (target column, classifier) pair and hand them to
        crossValidation_main().

        targetColumn: 'MINIMAL' or 'EQUIVALENT' (must also be listed by
            getPossibleTargetColumns()).
        classifier: must be listed by getPossibleClassifiers().
        specifiedProgram: when given, the per-program data set
            ML/Dataset/<column>/Programs/<program>.csv is used and results go
            to ML/Results/<column>/Programs/; otherwise mutants.csv is used.
        columnsToDrop / columnsToAdd: forwarded to crossValidation_main(); when
            non-empty they add ' - gbs_<list>' / ' - gfs_<list>' to the results
            file name of a run without a specified program. None means empty.
        printResults: accepted for interface compatibility; not used here.
        parameter: forwarded to crossValidation_main(); when not None the
            results file name gets the '_bestParameter' suffix.

        Returns None. Nothing is executed when the classifier or the target
        column is not supported.
    '''
    # None defaults avoid sharing one list object between calls
    columnsToDrop = [] if columnsToDrop is None else columnsToDrop
    columnsToAdd = [] if columnsToAdd is None else columnsToAdd

    if classifier not in getPossibleClassifiers() or targetColumn not in getPossibleTargetColumns():
        return None

    ####################################
    # --- Setting independent properties
    maxNeighbors = 40
    maxSamplesSplit = 100
    maxIterations = maxNeighbors if classifier == 'KNN' else maxSamplesSplit

    ######################
    # --- Setting datasets
    targetColumnName = targetColumn
    targetColumn = '_IM_{}'.format(targetColumn)

    # Verify whether a specific program was requested
    if specifiedProgram is not None:
        dataSetFileName = 'ML/Dataset/{}/Programs/{}.csv'.format(
            targetColumnName, specifiedProgram)
    else:
        dataSetFileName = 'ML/Dataset/{}/mutants.csv'.format(targetColumnName)

    if targetColumn == '_IM_MINIMAL':
        #####################
        # --- Setting columns
        columnNames = getColumnNames_lastMinimal()

        print('####################################################')
        print(' ----- Calculando para identificar mutantes minimais')
    elif targetColumn == '_IM_EQUIVALENT':
        #####################
        # --- Setting columns
        columnNames = getColumnNames_lastEquivalent()

        print('########################################################')
        print(' ----- Calculando para identificar mutantes equivalentes')
    else:
        return

    ###################
    # --- PreProcessing
    dataSet = importDataSet(dataSetFileName)

    ##############################
    # --- Setting results filename
    bestParameter = '_bestParameter' if parameter is not None else ''
    gbs = ' - gbs_{columns}'.format(
        columns=columnsToDrop) if len(columnsToDrop) > 0 else ''
    # The added columns are now interpolated, mirroring `gbs`; before, the
    # literal text '{columns}' ended up in the file name.
    gfs = ' - gfs_{columns}'.format(
        columns=columnsToAdd) if len(columnsToAdd) > 0 else ''

    if specifiedProgram is None:
        resultsFileName = 'ML/Results/{targetColumnName}/{classifier}{bestParameter}{gbs}{gfs}.csv'.format(
            targetColumnName=targetColumnName,
            classifier=classifier,
            gbs=gbs,
            gfs=gfs,
            bestParameter=bestParameter)
    else:
        resultsFileName = 'ML/Results/{targetColumnName}/Programs/{specifiedProgram}_{classifier}{bestParameter}.csv'.format(
            targetColumnName=targetColumnName,
            specifiedProgram=specifiedProgram,
            classifier=classifier,
            bestParameter=bestParameter)

    ##########################################
    # --- Executing classifier | KNN, DT or RF
    print(' ----- {}'.format(classifier))
    crossValidation_main(dataSet,
                         targetColumn,
                         classifier,
                         maxIterations,
                         resultsFileName,
                         columnNames,
                         columnsToDrop,
                         columnsToAdd,
                         parameter=parameter)