def findDuplicateCols(fileData, requiredCol, csvFile):
    """Report a required column that appears more than once in a CSV file.

    pandas renames a repeated header by appending '.1' to it, so the
    presence of '<requiredCol>.1' among the columns is treated as proof
    that the header was duplicated.

    Args:
        fileData: object exposing a ``columns`` collection (a DataFrame).
        requiredCol: name of the column to check.
        csvFile: description of the file, used only in the message.
    """
    mangledName = '{0!s}.1'.format(requiredCol)
    if mangledName not in fileData.columns:
        return

    # The message text indicates prints.err ends the program.
    message = "{0} column is duplicated in the {1}. Terminating Program".format(
        requiredCol, csvFile)
    prints.err(message)
def studentsHandler(studentsFileData, progMode):
    """Validate the students CSV data and publish it through module globals.

    Copies the relevant columns of ``studentsFileData`` into the module
    level series/dataframe globals, then verifies each of them. Project
    choices, assignments and avoid entries are rewritten in place from
    ids to the index of the matching id in ``projectIDs`` / ``studentID``.

    Args:
        studentsFileData: DataFrame read from the students csv file.
            Rows are addressed by position 0..len-1, so a default integer
            index is assumed.
        progMode: program mode; the 'assignment' column is only required
            and verified when this equals 'Scoring'.

    Returns:
        True if any validation error was logged, otherwise False.

    NOTE(review): relies on the globals ``projectIDs`` and ``maxValue``
    being set before this function runs - confirm against the caller.
    """
    # Series indexed by student
    global studentID
    global studentGPA
    global studentESL
    global studentPriority
    global studentAvoid
    global studentAssignment
    # DataFrame
    global studentChoiceN
    # Scalar
    global numStudents
    global numStudentChoices

    errFlag = False

    def logUnexpected(columnName, row, value):
        # Shared message for every "wrong kind of data" report.
        prints.logerr("Unexpected data found in {0}, row {1} = '{2}'".format(
            columnName, row, value))

    def checkExists(dataSeries):
        # Flag every empty (NaN) element of the series.
        errNan = False
        for student in range(numStudents):
            if pd.isna(dataSeries[student]):
                prints.logerr(
                    "Empty element found in {0} column, row {1}".format(
                        dataSeries.name, student))
                errNan = True
        return errNan

    def checkInt(dataSeries):
        # Verify the series holds integers only; empty elements are ignored.
        errInt = False
        if not is_numeric_dtype(dataSeries):
            # A non-numeric dtype means at least one element is not a number.
            errInt = True
            for student in range(numStudents):
                value = dataSeries[student]
                # Test for NaN first: an empty cell is a float and has no
                # isnumeric() method.
                if pd.isna(value):
                    continue
                if not str(value).isnumeric():
                    logUnexpected(dataSeries.name, student, value)
        elif not is_integer_dtype(dataSeries):
            # Float dtype: acceptable only if every value is whole.
            for student in range(numStudents):
                value = dataSeries[student]
                if pd.isna(value):
                    continue
                if not value.is_integer():
                    logUnexpected(dataSeries.name, student, value)
                    # A logged value must also fail the check.
                    errInt = True
        return errInt

    def checkFloat(dataSeries):
        # Verify every element of the series can be read as a float.
        errFloat = False
        if not is_numeric_dtype(dataSeries):
            errFloat = True
            for student in range(numStudents):
                try:
                    float(dataSeries[student])
                except ValueError:
                    logUnexpected(dataSeries.name, student,
                                  dataSeries[student])
        return errFloat

    def checkBool(dataSeries):
        # Verify the series is boolean; log each element that is neither a
        # bool nor the text 'true'/'false' (any letter case).
        errBool = False
        if not is_bool_dtype(dataSeries):
            errBool = True
            for student in range(numStudents):
                val = dataSeries[student]
                if isinstance(val, bool):
                    continue
                # str() keeps numeric cells from crashing on .lower().
                if str(val).lower() not in ('true', 'false'):
                    logUnexpected(dataSeries.name, student, val)
        return errBool

    def checkRange(dataSeries, rmin, rmax):
        # Flag every value outside the inclusive range [rmin, rmax].
        errRange = False
        for value in dataSeries:
            if (value < rmin) or (value > rmax):
                prints.logerr(
                    '{0} outside of acceptable range for {1}. Accepted values are between {2} - {3}'
                    .format(value, dataSeries.name, rmin, rmax))
                errRange = True
        return errRange

    def checkDup(dataSeries):
        # Flag every repeated value (first occurrence is not reported).
        sDuplicates = dataSeries[dataSeries.duplicated()]
        for duplicate in sDuplicates:
            prints.logerr("Duplicate {0} found '{1}'".format(
                dataSeries.name, duplicate))
        return len(sDuplicates) > 0

    def match1D(dataSeries, parentData):
        # dataSeries is being checked against parentData; each non-empty
        # element is replaced with the index of its match in parentData.
        errMatch = False
        for student in range(numStudents):
            if pd.isna(dataSeries[student]):
                continue
            sMatch = False
            for j in range(len(parentData)):
                if dataSeries[student] == parentData[j]:
                    sMatch = True
                    # replace element with parentData index
                    dataSeries.at[student] = j
                    break
            if not sMatch:
                prints.logerr(
                    "No match found in {0} column, row {1} = {2:n}".format(
                        dataSeries.name, student, dataSeries[student]))
                errMatch = True
        return errMatch

    def matchProject2D(dataFrame):
        # Replace every non-empty project id in dataFrame with the index of
        # that id in the global projectIDs; log ids that cannot be matched.
        errProj = False
        # when Pandas finds unexpected data type, elements in dataFrame
        # are cast as objects, not int
        for col in dataFrame.columns:
            numeric = bool(is_numeric_dtype(dataFrame[col]))
            for row in dataFrame.index:
                if pd.isna(dataFrame.at[row, col]):
                    continue
                # Numeric typecasting for when a letter is present in the
                # column
                if not numeric:
                    try:
                        dataFrame.at[row, col] = int(dataFrame.at[row, col])
                    except (ValueError, TypeError):
                        logUnexpected(col, row, dataFrame.at[row, col])
                        errProj = True
                # Initialised before the loop so an empty projectIDs list
                # reports "no match" instead of raising NameError.
                sChoiceMatch = False
                for i in range(len(projectIDs)):
                    if dataFrame.at[row, col] == projectIDs[i]:
                        sChoiceMatch = True
                        # replace project id with project index
                        dataFrame.at[row, col] = i
                        break
                if not sChoiceMatch:
                    prints.logerr(
                        "No matching project id found for {0} = '{1}'".format(
                            col, dataFrame.at[row, col]))
                    errProj = True
        return errProj

    # Each composite check stops at the first failing step, because the
    # later steps assume the earlier ones passed.
    def checkStudentID(idSeries):
        return (checkExists(idSeries) or checkInt(idSeries)
                or checkDup(idSeries) or checkRange(idSeries, 0, maxValue))

    def checkStudentGPA(gpaSeries):
        return (checkExists(gpaSeries) or checkFloat(gpaSeries)
                or checkRange(gpaSeries, 0.0, 4.0))

    def checkStudentESL(eslSeries):
        return checkBool(eslSeries)

    def checkStudentPriority(prioritySeries):
        return checkBool(prioritySeries)

    def checkAssignment(assignmentSeries):
        return (checkExists(assignmentSeries) or checkInt(assignmentSeries)
                or match1D(assignmentSeries, projectIDs))

    def checkAvoid(avoidSeries):
        # Empty avoid entries are allowed, so there is no checkExists step.
        return checkInt(avoidSeries) or match1D(avoidSeries, studentID)

    def checkChoices(choiceFrame):
        # Only the first choice is mandatory; both steps always run.
        errChoice = checkExists(choiceFrame['studentChoice1'])
        if matchProject2D(choiceFrame):
            errChoice = True
        return errChoice

    # Verify required columns are present
    requiredColumns = [
        'studentID', 'studentChoice1', 'studentGPA', 'studentESL',
        'studentAvoid', 'studentPriority'
    ]
    for col in requiredColumns:
        if col not in studentsFileData.columns:
            prints.err(
                "Required {0} column header not found in the students csv file. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(studentsFileData, col, 'Students CSV File')

    # Search for sequential studentChoice columns to store in global dataframe
    choiceFields = ['studentChoice1']
    studentsCols = list(studentsFileData)
    for i in range(2, len(studentsFileData.columns)):
        sChoiceI = 'studentChoice' + str(i)
        if sChoiceI not in studentsCols:
            break  # stop at the first gap in the numbering
        choiceFields.append(sChoiceI)

    # Store studentChoice columns in global dataframe
    studentChoiceN = studentsFileData[choiceFields].copy()

    # Create global series
    studentID = studentsFileData['studentID'].copy()
    studentGPA = studentsFileData['studentGPA'].copy()
    studentESL = studentsFileData['studentESL'].fillna(False)
    studentPriority = studentsFileData['studentPriority'].fillna(False)
    studentAvoid = studentsFileData['studentAvoid'].copy()

    # Define global variable
    numStudents = len(studentsFileData)
    numStudentChoices = len(studentChoiceN.columns)

    # Verify student ID data; the avoid column is matched against the
    # student ids, so it is only verified when the ids are valid.
    if checkStudentID(studentID):
        errFlag = True
    elif checkAvoid(studentAvoid):
        errFlag = True

    # Verify studentGPA data
    if checkStudentGPA(studentGPA):
        errFlag = True

    # Verify studentESL data
    if checkStudentESL(studentESL):
        errFlag = True

    # Verify student Priority data
    if checkStudentPriority(studentPriority):
        errFlag = True

    # Verify student Choices
    if checkChoices(studentChoiceN):
        errFlag = True

    # Verify assignment column when in scoring mode
    if progMode == 'Scoring':
        if 'assignment' in studentsFileData.columns:
            # Store assignment column
            studentAssignment = studentsFileData['assignment'].copy()
            if checkAssignment(studentAssignment):
                errFlag = True
        else:
            prints.err("No assignment column found. Terminating program.")

    return errFlag
def projectsHandler(projectsFileData):
    """Validate the projects CSV data and publish it through module globals.

    Sets the globals ``projectIDs``, ``minTeamSize`` and ``maxTeamSize``
    from the values returned by ``int_checker``. Blank team sizes are
    filled, in ``projectsFileData`` itself, with ``defaultMinTeamSize`` /
    ``defaultMaxTeamSize``.

    Args:
        projectsFileData: DataFrame read from the projects csv file.

    Returns:
        True if any validation error was found, otherwise False.

    NOTE(review): the defaults are module globals assigned by
    settingsHandler, so that handler presumably must run first - confirm.
    """
    # arrays indexed by project
    global minTeamSize
    global maxTeamSize
    global projectIDs

    projectsErrFlag = False

    # verify required project csv headers are present and not duplicated
    requiredColumns = ['projectID', 'minTeamSize', 'maxTeamSize']
    for col in requiredColumns:
        if col not in projectsFileData.columns:
            prints.err(
                "Required {0} column not found in the projects csv file. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(projectsFileData, col, 'Projects CSV file')

    # verify that all values in program csv file are integers
    projectIDs, intErr = int_checker('projectID', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    # verify that there are no duplicate project IDs in the projectID column
    duplicateMask = projectsFileData.projectID.duplicated()
    if duplicateMask.any():
        projectsErrFlag = True
        # Report the repeated id itself rather than the whole csv row.
        for duplicateID in projectsFileData.projectID[duplicateMask]:
            prints.logerr(
                "Duplicate projectID found: {0}".format(duplicateID))

    # if values for team sizes are blank, enter size from settings csv and
    # then verify all values are integers
    projectsFileData['minTeamSize'] = projectsFileData['minTeamSize'].fillna(
        defaultMinTeamSize)
    minTeamSize, intErr = int_checker('minTeamSize', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    projectsFileData['maxTeamSize'] = projectsFileData['maxTeamSize'].fillna(
        defaultMaxTeamSize)
    # Store the result in intErr so the maxTeamSize outcome, not the stale
    # minTeamSize one, is what gets folded into the error flag.
    maxTeamSize, intErr = int_checker('maxTeamSize', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    # verify minTeamSize is not greater than maxTeamSize
    for minSize, maxSize, pid in zip(minTeamSize, maxTeamSize, projectIDs):
        if minSize > maxSize:
            projectsErrFlag = True
            prints.logerr(
                "minTeamSize is greater than maxTeamSize for projectID {0}.".
                format(pid))

    # warn user if gap found in projectID sequence, assuming projectIDs start
    # at projectID '1' and end with the largest id.
    # arithmetic series = (n(firstNum + lastNum)) / 2, where n is # of terms
    # in sequence (the last id under that assumption), then subtract the
    # real sum of projectIDs.
    try:
        projectIDGap = projectIDs[-1] * (projectIDs[0] +
                                         projectIDs[-1]) / 2 - sum(projectIDs)
    except (ValueError, TypeError, IndexError):
        # Empty or non-numeric ids; the cause of the error would already
        # have been identified above, so skip the gap warning.
        projectIDGap = 0

    if projectIDGap != 0:
        prints.warn(
            "gap found in projectID sequence in the projects csv file.")

    return projectsErrFlag
def settingsHandler(settingsFileData):
    """Validate the settings CSV data and publish it through module globals.

    Reads the weights from the 'points' column and the limits from the
    'min' / 'max' columns, checks them, and stores them in the module
    globals declared below.

    Args:
        settingsFileData: DataFrame read from the settings csv file, with
            one setting per row identified by its 'name' column.

    Returns:
        True if any validation error was found, otherwise False. An
        unusable 'effort' value only produces a warning and falls back to
        ``defaultEffort``.
    """
    global weightMaxLowGPAStudents
    global weightMaxESLStudents
    global weightMaxTeamSize
    global weightMinTeamSize
    global weightStudentPriority
    global weightStudentChoice1
    global weightAvoid
    global effort
    global maxLowGPAStudents
    global maxESLStudents
    global lowGPAThreshold
    global defaultMaxTeamSize
    global defaultMinTeamSize

    settingsErrFlag = False

    # verify required settings csv headers are present and not duplicated
    requiredColumns = ['name', 'min', 'max', 'points']
    for col in requiredColumns:
        if col not in settingsFileData.columns:
            prints.err(
                "Required {0} column header not found in the settings csv. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(settingsFileData, col, 'Settings CSV file')

    # verify required settings csv rows are present and not duplicated
    requiredRows = [
        'teamSize', 'lowGPAThreshold', 'maxLowGPAStudents', 'maxESLStudents',
        'weightMaxLowGPAStudents', 'weightMaxESLStudents', 'weightMinTeamSize',
        'weightMaxTeamSize', 'weightStudentPriority', 'weightStudentChoice1',
        'weightAvoid', 'effort'
    ]
    for row in requiredRows:
        if row not in settingsFileData['name'].values:
            prints.err(
                "Required {0} row not found in the settings csv file. Terminating Program."
                .format(row))
        if len(settingsFileData[settingsFileData['name'] == row]) > 1:
            prints.err(
                "Required {0} row is duplicated in the settings csv file. Terminating Program."
                .format(row))

    # Index the settings by name once instead of re-indexing per lookup.
    settingsByName = settingsFileData.set_index('name')

    def checkedInt(rowName, column, checkerMode):
        # Run int_checker on one cell and fold its result into the flag.
        nonlocal settingsErrFlag
        value, intErr = int_checker(None, rowName,
                                    settingsByName.at[rowName, column],
                                    checkerMode)
        settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
        return value

    # verify required fields in 'points' column contain integers
    # if they are, assign value to global variable for scoring function to use
    weightMaxLowGPAStudents = checkedInt('weightMaxLowGPAStudents', 'points', 0)
    weightMaxESLStudents = checkedInt('weightMaxESLStudents', 'points', 0)
    weightMinTeamSize = checkedInt('weightMinTeamSize', 'points', 0)
    weightMaxTeamSize = checkedInt('weightMaxTeamSize', 'points', 0)
    weightStudentPriority = checkedInt('weightStudentPriority', 'points', 0)
    weightStudentChoice1 = checkedInt('weightStudentChoice1', 'points', 0)
    weightAvoid = checkedInt('weightAvoid', 'points', 0)

    # verify required values in 'max' and 'min' column are integers
    # if they are, assign value to global variable for scoring function to use
    defaultMaxTeamSize = checkedInt('teamSize', 'max', 1)
    defaultMinTeamSize = checkedInt('teamSize', 'min', 1)
    maxLowGPAStudents = checkedInt('maxLowGPAStudents', 'max', 1)
    maxESLStudents = checkedInt('maxESLStudents', 'max', 1)

    # if effort value is provided in the csv file, verify the value is an
    # integer within the required range; if it is not, use default value
    try:
        effort = int(settingsByName.at['effort', 'max'])
    except (ValueError, TypeError):
        prints.warn(
            "valid 'effort' value not found in the settings csv. Running with default value."
        )
        # Assign the default here; otherwise the range check below would
        # read an unset (or stale) global.
        effort = defaultEffort
    else:
        if effort < minEffort or effort > maxEffort:
            effort = defaultEffort
            prints.warn(
                "'effort' in the settings csv is not an int between 1 and 100. Running with default value of 20."
            )

    # verify that provided lowGPAThreshold is not empty and that it's a float
    # within range; if it is, assign value to global variable for scoring
    # function to use
    try:
        lowGPAThreshold = float(settingsByName.at['lowGPAThreshold', 'min'])
    except ValueError:
        prints.logerr("The lowGPAThreshold 'min' value is not a float.")
        settingsErrFlag = True
        # temp value to pass statement below to allow program to continue
        # checking for errors
        lowGPAThreshold = 0

    # An empty cell parses to NaN, which fails neither comparison, hence
    # the explicit isna test.
    if (lowGPAThreshold < minGPAThreshold) or (
            lowGPAThreshold > maxGPAThreshold) or pd.isna(lowGPAThreshold):
        prints.logerr(
            "lowGPAThreshold 'min' setting requires a 0.00 - 4.00 value.")
        settingsErrFlag = True

    return settingsErrFlag
def argumentParser():
    """Parse the command line and publish the run options as module globals.

    Sets ``programMode`` ('Scoring' and/or 'Assignment' when the matching
    flag is given; Assignment wins if both are), ``scoreBreakdown`` (only
    when -b is given) and the genetic-algorithm parameters. A parameter
    that is not a number or is out of range produces a warning and falls
    back to its default.

    Returns:
        Tuple of (students, projects, settings, output) csv file names.

    NOTE(review): ``programMode`` is not assigned here when neither -a nor
    -c is passed; presumably a module-level default exists - confirm.
    """
    global scoreBreakdown
    global programMode
    global mutationProbability
    global populationSize
    global eliteRatio
    global crossoverProbability
    global parentsPortion

    # accepted command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--students", help="Students CSV filename",
                        required=False, default='students.csv')
    parser.add_argument("-p", "--projects", help="Projects CSV filename",
                        required=False, default='projects.csv')
    parser.add_argument("-u", "--settings", help="Settings CSV filename",
                        required=False, default='settings.csv')
    parser.add_argument("-o", "--output", help="Output CSV filename",
                        required=False, default='assign.csv')
    parser.add_argument("-a", "--assign",
                        help="Run program in Assignment mode",
                        required=False, action='store_true')
    parser.add_argument("-c", "--score",
                        help="Run the program in Scoring mode",
                        required=False, action='store_true')
    parser.add_argument("-b", "--breakdown", help="Display score breakdown",
                        required=False, action='store_true')
    parser.add_argument("-mutation", "--mutation",
                        help="Mutation Probability",
                        required=False, default=0.02)
    parser.add_argument("-population", "--population",
                        help="Population Size",
                        required=False, default=100)
    parser.add_argument("-elite", "--elite", help="Elite Ratio",
                        required=False, default=0.01)
    parser.add_argument("-crossover", "--crossover",
                        help="Crossover Probability",
                        required=False, default=0.5)
    parser.add_argument("-parents", "--parents", help="Parents Portion",
                        required=False, default=0.3)

    argument = parser.parse_args()

    def parseFraction(rawValue, label, default):
        # Read a 0.0 - 1.0 option; warn and use the default when the value
        # is not a number or lies outside that range.
        try:
            value = float(rawValue)
        except (ValueError, TypeError):
            prints.warn("{0} is not a number, defaulting to {1}".format(
                label, default))
            return default
        # Written as a positive range test so NaN is rejected as well.
        if not 0.0 <= value <= 1.0:
            prints.warn(
                "{0} is out of required range, defaulting to {1}".format(
                    label, default))
            return default
        return value

    # assign csv file names and mode preference; every option has a
    # default, so the names are always bound.
    studentsFileName = argument.students
    projectsFileName = argument.projects
    settingsFileName = argument.settings
    outputFileName = argument.output

    if argument.score:
        programMode = 'Scoring'
    if argument.assign:
        programMode = 'Assignment'
        if argument.score:
            prints.warn(
                "both Scoring (-c) and Assignment (-a) modes selected. Program will run in Assignment mode."
            )
    if argument.breakdown:
        scoreBreakdown = True

    mutationProbability = parseFraction(argument.mutation,
                                        "Mutation Probability", 0.02)
    try:
        populationSize = int(argument.population)
    except (ValueError, TypeError):
        prints.warn("Population Size is not an integer, defaulting to 100")
        populationSize = 100
    eliteRatio = parseFraction(argument.elite, "Elite Ratio", 0.01)
    crossoverProbability = parseFraction(argument.crossover,
                                         "Crossover Probability", 0.5)
    parentsPortion = parseFraction(argument.parents, "Parents Portion", 0.3)

    # when running program in Assignment mode:
    # if the user provided output file already exists, warn user,
    # or if the directory of that output file does not exist, report it
    if programMode == 'Assignment':
        if os.path.exists(outputFileName):
            prints.warn(
                "output file {0} already exists in the directory and will be overwritten with new assignments."
                .format(outputFileName))
        elif not os.path.isdir(
                os.path.dirname(os.path.abspath(outputFileName))):
            prints.err("directory for output file {0} does NOT exist.".format(
                outputFileName))

    return studentsFileName, projectsFileName, settingsFileName, outputFileName
if __name__ == "__main__": # command line parser and error handling studentFile, projectFile, settingFile, outputFile = argumentParser() # load csv files. returns csv file dataframe and final csv filename settingsFileData, settingsFile = csvFileCheck(settingFile) projectsFileData, projectsFile = csvFileCheck(projectFile) studentsFileData, studentsFile = csvFileCheck(studentFile) # terminate program if any errors detected in the csvFileCheck function if errFlag: prints.err( "Program Terminated in command line handler. See messages(s) above for additional information." ) # read, parse, and handle errors of all three csv files # errFlag used if errors are found in the csv files # violating csv files are appended to array to inform user which file(s) the errors came from errFiles = [] if load_csv.settingsHandler(settingsFileData): errFiles.append(settingsFile) if load_csv.projectsHandler(projectsFileData): errFiles.append(projectsFile) if load_csv.studentsHandler(studentsFileData, programMode): errFiles.append(studentsFile) if errFiles: prints.err(