def check_range(value, location, minValue):
    """Report whether ``value`` lies outside the permitted integer range.

    The lower bound is ``minValue``; the upper bound is the ``maxValue``
    name resolved from the enclosing module scope. An out-of-range value is
    reported through ``prints.logerr``.

    Args:
        value: The number to test.
        location: Description of where the value came from; used only in
            the error message.
        minValue: Smallest acceptable value (inclusive).

    Returns:
        True when the value is out of range (i.e. an error was logged),
        False otherwise.
    """
    outOfRange = (value < minValue) or (value > maxValue)
    if not outOfRange:
        return False

    message = "Value {0} in {1} is an integer out of the required range."
    prints.logerr(message.format(value, location))
    return True
def isinstance_float(value, location, minValue):
    """Validate a value that may be a float holding a whole number.

    Args:
        value: The value to inspect.
        location: Description of where the value came from; used in error
            messages.
        minValue: Lower bound forwarded to ``check_range``.

    Returns:
        A 3-tuple ``(wasFloat, value, errFlag)``:

        * not a float        -> ``(False, value, False)``
        * fractional float   -> ``(True, value, True)`` after logging
        * whole-number float -> ``(True, int(value), <check_range result>)``
    """
    # Anything that is not a float is left for the caller to deal with.
    if not isinstance(value, float):
        return False, value, False

    # A float with a fractional part can never be a valid int.
    if not value.is_integer():
        prints.logerr("Value {0} in {1} is not a required int.".format(
            value, location))
        return True, value, True

    wholeNumber = int(value)
    return True, wholeNumber, check_range(value, location, minValue)
def checkRange(dataSeries, rmin, rmax):
    """Check that every value in ``dataSeries`` lies within [rmin, rmax].

    Each offending value is reported through ``prints.logerr`` together
    with ``dataSeries.name`` and the accepted bounds.

    Args:
        dataSeries: Iterable of comparable values; its ``name`` attribute
            is read only when an offending value is found.
        rmin: Smallest acceptable value (inclusive).
        rmax: Largest acceptable value (inclusive).

    Returns:
        True if at least one value was outside the range, else False.
    """
    template = ('{0} outside of acceptable range for {1}. '
                'Accepted values are between {2} - {3}')
    foundOutlier = False
    for entry in dataSeries:
        if not ((entry < rmin) or (entry > rmax)):
            continue
        prints.logerr(template.format(entry, dataSeries.name, rmin, rmax))
        foundOutlier = True
    return foundOutlier
def isinstance_obj(value, location, minValue):
    """Validate a value by first converting it to a float.

    Args:
        value: The raw value to convert with ``float()``.
        location: Description of where the value came from; used in error
            messages.
        minValue: Lower bound forwarded to ``isinstance_float``.

    Returns:
        ``(True, 0, True)`` after logging when ``float(value)`` raises
        ``ValueError``; otherwise the 3-tuple produced by
        ``isinstance_float`` for the converted value.
    """
    try:
        asFloat = float(value)
    except ValueError:
        prints.logerr("{0} in {1} is not an integer.".format(
            value, location))
        return True, 0, True

    wasFloat, checkedValue, errFlag = isinstance_float(
        asFloat, location, minValue)
    return wasFloat, checkedValue, errFlag
def checkDup(dataSeries):
    """Report repeated values in ``dataSeries``.

    Every entry flagged by ``dataSeries.duplicated()`` is logged through
    ``prints.logerr`` along with the series name.

    Args:
        dataSeries: pandas Series to inspect.

    Returns:
        True if any duplicate was found, else False.
    """
    repeatMask = dataSeries.duplicated()
    if not repeatMask.any():
        return False

    repeated = dataSeries[repeatMask]
    for position in range(len(repeated)):
        prints.logerr("Duplicate {0} found '{1}'".format(
            dataSeries.name, repeated.iloc[position]))
    return True
def checkExists(dataSeries):
    """Verify that ``dataSeries`` has no missing (NaN) elements.

    Rows ``0 .. numStudents - 1`` are inspected (``numStudents`` is read
    from module scope). Each empty element is reported through
    ``prints.logerr`` with the series name and row number.

    Args:
        dataSeries: Series indexable by row number.

    Returns:
        True if at least one empty element was found, else False.
    """
    foundEmpty = False
    for rowIdx in range(numStudents):
        cellMissing = pd.isna(dataSeries[rowIdx])
        if cellMissing is not True:
            continue
        # Element is empty
        prints.logerr(
            "Empty element found in {0} column, row {1}".format(
                dataSeries.name, rowIdx))
        foundEmpty = True
    return foundEmpty
def checkFloat(dataSeries):
    """Check that ``dataSeries`` holds numeric (float-convertible) data.

    A series with a numeric dtype passes immediately. Otherwise the series
    is flagged as an error and rows ``0 .. numStudents - 1`` are scanned;
    each element for which ``float()`` raises ``ValueError`` is reported
    through ``prints.logerr``.

    Args:
        dataSeries: pandas Series to validate.

    Returns:
        False when the series dtype is numeric, True otherwise.
    """
    if is_numeric_dtype(dataSeries) is not False:
        return False

    for rowIdx in range(numStudents):
        try:
            float(dataSeries[rowIdx])
        except ValueError:
            prints.logerr(
                "Unexpected data found in {0}, row {1} = '{2}'".format(
                    dataSeries.name, rowIdx, dataSeries[rowIdx]))
    return True
def checkBool(dataSeries):
    """Check that ``dataSeries`` holds boolean data.

    A series with a boolean dtype passes immediately. Otherwise the series
    is flagged as an error and rows ``0 .. numStudents - 1`` are scanned
    (``numStudents`` is read from module scope) so that every offending
    element can be reported through ``prints.logerr``.

    An element is accepted when it is a boolean, or a string equal to
    'true' / 'false' ignoring case. Anything else (numbers, empty cells,
    other text) is reported. Previously ``.lower()`` was called on every
    value that was not the ``False`` singleton, so a ``True``, NaN or
    numeric element raised ``AttributeError`` instead of being reported.

    Args:
        dataSeries: pandas Series to validate.

    Returns:
        False when the series dtype is boolean, True otherwise (even if
        every individual element turned out to be acceptable).
    """
    if is_bool_dtype(dataSeries):
        return False

    for student in range(numStudents):
        val = dataSeries[student]
        if pd.api.types.is_bool(val):
            # Python or numpy boolean: valid as-is.
            continue
        if isinstance(val, str) and val.lower() in ('true', 'false'):
            continue
        prints.logerr(
            "Unexpected data found in {0}, row {1} = '{2}'".format(
                dataSeries.name, student, val))
    return True
def checkInt(dataSeries):
    """Check that ``dataSeries`` holds integer data.

    Three cases, decided by the series dtype:

    * integer dtype: passes immediately.
    * non-numeric dtype: the series is flagged as an error and rows
      ``0 .. numStudents - 1`` are scanned; every non-empty element whose
      text is not numeric is reported through ``prints.logerr``.
    * numeric but not integer dtype (e.g. floats, which is what pandas
      uses when an integer column has empty cells): every non-empty
      element that is not a whole number is reported and the series is
      flagged as an error.

    Fixes relative to the previous implementation:

    * Empty (NaN) elements are skipped *before* any method is called on
      them; previously ``.isnumeric()`` was invoked on NaN and raised
      ``AttributeError``.
    * A fractional value found in a float column now sets the returned
      error flag; previously the error was logged but False was returned.

    Args:
        dataSeries: pandas Series to validate.

    Returns:
        True if the series is not acceptable as integer data, else False.
    """
    errInt = False
    if not is_numeric_dtype(dataSeries):  # Look at series label
        errInt = True
        for student in range(numStudents):
            value = dataSeries[student]
            if pd.isna(value):
                continue
            # str() so that a stray non-string object is reported rather
            # than raising AttributeError.
            if not str(value).isnumeric():  # Look at element
                prints.logerr(
                    "Unexpected data found in {0}, row {1} = '{2}'".format(
                        dataSeries.name, student, value))
    elif not is_integer_dtype(dataSeries):
        for student in range(numStudents):
            value = dataSeries[student]
            if pd.isna(value):
                continue
            if not value.is_integer():
                prints.logerr(
                    "Unexpected data found in {0}, row {1} = '{2}'".format(
                        dataSeries.name, student, value))
                errInt = True
    return errInt
def csvFileCheck(csvFileName):
    """Locate and load a csv file.

    If ``csvFileName`` does not exist, the same name with a '.csv'
    extension appended is tried. On failure (file not found, or
    ``pd.read_csv`` raising ``ValueError``) the error is logged through
    ``prints.logerr`` and the module-level ``errFlag`` is set to True.

    Args:
        csvFileName: Path of the csv file, with or without the extension.

    Returns:
        ``(dataFrame, resolvedFileName)`` on success, ``(0, 0)`` on failure.
    """
    global errFlag

    resolvedName = csvFileName
    if not os.path.exists(resolvedName):
        # Original filename not found: retry with the .csv extension added.
        withExtension = csvFileName + '.csv'
        if os.path.exists(withExtension):
            resolvedName = withExtension
        else:
            prints.logerr(
                "{0} csv file can not be found.".format(csvFileName))
            errFlag = True
            return 0, 0

    try:
        loadedFrame = pd.read_csv(resolvedName)
    except ValueError:
        prints.logerr("{0} is not a valid csv file.".format(resolvedName))
        errFlag = True
        return 0, 0

    return loadedFrame, resolvedName
def match1D(dataSeries, parentData):
    """Replace each element of ``dataSeries`` with its index in ``parentData``.

    ``dataSeries`` is checked against ``parentData``: for rows
    ``0 .. numStudents - 1`` (``numStudents`` is read from module scope),
    every non-empty element is looked up in ``parentData`` and, when found,
    overwritten in place with the position of the first match. Elements
    without a match are reported through ``prints.logerr``.

    Args:
        dataSeries: pandas Series to translate; modified in place.
        parentData: Indexable sequence of the values to match against.

    Returns:
        True if at least one non-empty element had no match, else False.
    """
    anyUnmatched = False
    for rowIdx in range(numStudents):
        if pd.isna(dataSeries[rowIdx]) is not False:
            continue  # empty element: nothing to match

        # Find the matching element in parentData
        for parentIdx in range(len(parentData)):
            if dataSeries[rowIdx] == parentData[parentIdx]:
                # replace element with parentData index
                dataSeries.at[rowIdx] = parentIdx
                break
        else:
            prints.logerr(
                "No match found in {0} column, row {1} = {2:n}".format(
                    dataSeries.name, rowIdx, dataSeries[rowIdx]))
            anyUnmatched = True
    return anyUnmatched
def matchProject2D(dataFrame):
    """Replace every project id in ``dataFrame`` with its project index.

    Each non-empty cell is looked up in the module-level ``projectIDs``
    and, when found, overwritten in place with the position of the first
    match. When pandas finds an unexpected data type in a column, the
    column's elements are cast as objects rather than ints; for such
    columns each cell is first converted with ``int()``.

    Errors are reported through ``prints.logerr``:

    * a cell in a non-numeric column that cannot be converted to ``int``;
    * a cell with no matching entry in ``projectIDs`` (a cell that failed
      conversion is reported here as well, as before).

    Fixes relative to the previous implementation:

    * The match flag is initialised before the search loop, so an empty
      ``projectIDs`` no longer leaves it unbound (or stale from the
      previous cell).
    * The bare ``except:`` is narrowed to the exceptions ``int()`` raises
      for bad input, so KeyboardInterrupt/SystemExit are not swallowed.

    Args:
        dataFrame: pandas DataFrame of project choices; modified in place.

    Returns:
        True if any error was reported, else False.
    """
    errProj = False

    for col in dataFrame.columns:
        # Flag for typecasting when the column is not numeric
        numeric = is_numeric_dtype(dataFrame[col])

        for row in dataFrame.index:
            if pd.isna(dataFrame[col][row]):
                continue  # empty element

            # Numeric typecasting for when a letter is present in the column
            if not numeric:
                try:
                    dataFrame.at[row, col] = int(dataFrame[col][row])
                except (TypeError, ValueError, OverflowError):
                    prints.logerr(
                        "Unexpected data found in {0}, row {1} = '{2}'"
                        .format(col, row, dataFrame[col][row]))
                    errProj = True

            # Find matching id in global projectIDs
            sChoiceMatch = False
            for i, projectID in enumerate(projectIDs):
                if dataFrame[col][row] == projectID:
                    sChoiceMatch = True
                    # replace project id with project index
                    dataFrame.at[row, col] = i
                    break

            if not sChoiceMatch:
                prints.logerr(
                    "No matching project id found for {0} = '{1}'".format(
                        col, dataFrame[col][row]))
                errProj = True

    return errProj
def projectsHandler(projectsFileData):
    """Validate the projects csv data and publish it to module globals.

    Sets the module-level ``projectIDs``, ``minTeamSize`` and
    ``maxTeamSize`` (arrays indexed by project) from the values returned by
    ``int_checker``. Blank team sizes are filled from
    ``defaultMinTeamSize`` / ``defaultMaxTeamSize`` before checking.

    Checks performed:

    * required column headers are present (``prints.err`` otherwise) and
      passed to ``findDuplicateCols``;
    * projectID, minTeamSize and maxTeamSize values pass ``int_checker``;
    * no projectID is duplicated;
    * minTeamSize is not greater than maxTeamSize for any project;
    * a warning (not an error) is issued if the projectID sequence appears
      to contain a gap.

    Fixes relative to the previous implementation:

    * The error flag returned by the maxTeamSize check is now the one fed
      to ``check_error_flag``; previously the stale flag from the
      minTeamSize check was reused and maxTeamSize errors were dropped.
    * The gap check no longer crashes on an empty or non-numeric
      ``projectIDs``.

    Args:
        projectsFileData: pandas DataFrame read from the projects csv;
            its team-size columns are modified in place.

    Returns:
        True if any error was found, else False.
    """
    # arrays indexed by project
    global minTeamSize
    global maxTeamSize
    global projectIDs

    projectsErrFlag = False

    # verify required project csv headers are present and not duplicated
    requiredColumns = ['projectID', 'minTeamSize', 'maxTeamSize']
    for col in requiredColumns:
        if col not in projectsFileData.columns:
            prints.err(
                "Required {0} column not found in the projects csv file. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(projectsFileData, col, 'Projects CSV file')

    # verify that all values in program csv file are integers
    projectIDs, intErr = int_checker('projectID', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    # verify that there are no duplicate project IDs in the projectID column
    duplicateMask = projectsFileData.projectID.duplicated()
    if duplicateMask.any():
        projectsErrFlag = True
        projectDuplicates = projectsFileData[duplicateMask]
        for i in range(len(projectDuplicates)):
            prints.logerr("Duplicate projectID found: {0}".format(
                projectDuplicates.iloc[i]))

    # if values for team sizes are blank, enter size from settings csv and
    # then verify all values are integers
    projectsFileData['minTeamSize'] = projectsFileData['minTeamSize'].fillna(
        defaultMinTeamSize)
    minTeamSize, intErr = int_checker('minTeamSize', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    projectsFileData['maxTeamSize'] = projectsFileData['maxTeamSize'].fillna(
        defaultMaxTeamSize)
    maxTeamSize, intErr = int_checker('maxTeamSize', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    # verify minTeamSize is not greater than maxTeamSize
    # zip() used to iterate in parallel over multiple iterables
    for minSize, maxSize, pid in zip(minTeamSize, maxTeamSize, projectIDs):
        if minSize > maxSize:
            projectsErrFlag = True
            prints.logerr(
                "minTeamSize is greater than maxTeamSize for projectID {0}.".
                format(pid))

    # warn user if gap found in projectID sequence, assuming projectIDs
    # start at projectID '1'
    # arithmetic series = (n(firstNum + lastNum)) / 2, where n is # of terms
    # in sequence, then subtract real sum of projectIDs
    try:
        projectIDGap = projectIDs[-1] * (projectIDs[0] +
                                         projectIDs[-1]) / 2 - sum(projectIDs)
    except (ValueError, TypeError, IndexError):
        # temp value to pass next if statement. Cause of error would have
        # already been identified.
        projectIDGap = 0
    if projectIDGap != 0:
        prints.warn(
            "gap found in projectID sequence in the projects csv file.")

    return projectsErrFlag
def _read_int_setting(settingsByName, rowName, column, minValue, errFlag):
    """Run ``int_checker`` on one settings cell and fold in its error flag.

    Returns ``(checkedValue, updatedErrFlag)``.
    """
    value, intErr = int_checker(None, rowName,
                                settingsByName.at[rowName, column], minValue)
    return value, check_error_flag(intErr, errFlag)


def settingsHandler(settingsFileData):
    """Validate the settings csv data and publish it to module globals.

    Sets the module-level scoring weights, ``effort``, the
    ``maxLowGPAStudents`` / ``maxESLStudents`` limits, ``lowGPAThreshold``
    and the default team sizes.

    Checks performed:

    * required column headers are present (``prints.err`` otherwise) and
      passed to ``findDuplicateCols``;
    * required rows are present and not duplicated (``prints.err``);
    * the 'points' weights and the 'min'/'max' limits pass ``int_checker``;
    * ``effort`` is an int between ``minEffort`` and ``maxEffort``,
      otherwise ``defaultEffort`` is used and a warning is issued;
    * ``lowGPAThreshold`` is a float between ``minGPAThreshold`` and
      ``maxGPAThreshold``.

    Fixes relative to the previous implementation:

    * When the 'effort' value cannot be parsed, ``effort`` is now actually
      set to ``defaultEffort``; previously only a warning was printed and
      the following range check used a stale or undefined ``effort``.
    * The name-indexed view of the settings is built once instead of once
      per lookup.

    Args:
        settingsFileData: pandas DataFrame read from the settings csv.

    Returns:
        True if any error was found, else False.
    """
    global weightMaxLowGPAStudents
    global weightMaxESLStudents
    global weightMaxTeamSize
    global weightMinTeamSize
    global weightStudentPriority
    global weightStudentChoice1
    global weightAvoid
    global effort
    global maxLowGPAStudents
    global maxESLStudents
    global lowGPAThreshold
    global defaultMaxTeamSize
    global defaultMinTeamSize

    settingsErrFlag = False

    # verify required settings csv headers are present and not duplicated
    requiredColumns = ['name', 'min', 'max', 'points']
    for col in requiredColumns:
        if col not in settingsFileData.columns:
            prints.err(
                "Required {0} column header not found in the settings csv. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(settingsFileData, col, 'Settings CSV file')

    # verify required settings csv rows are present and not duplicated
    requiredRows = [
        'teamSize', 'lowGPAThreshold', 'maxLowGPAStudents', 'maxESLStudents',
        'weightMaxLowGPAStudents', 'weightMaxESLStudents',
        'weightMinTeamSize', 'weightMaxTeamSize', 'weightStudentPriority',
        'weightStudentChoice1', 'weightAvoid', 'effort'
    ]
    for row in requiredRows:
        if row not in settingsFileData['name'].values:
            prints.err(
                "Required {0} row not found in the settings csv file. Terminating Program."
                .format(row))
        if len(settingsFileData[settingsFileData['name'] == row]) > 1:
            prints.err(
                "Required {0} row is duplicated in the settings csv file. Terminating Program."
                .format(row))

    # Index the settings by name once; every lookup below goes through it.
    settingsByName = settingsFileData.set_index('name')

    # verify required fields in 'points' column contain integers
    # if they are, assign value to global variable for scoring function to use
    weightMaxLowGPAStudents, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightMaxLowGPAStudents', 'points', 0,
        settingsErrFlag)
    weightMaxESLStudents, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightMaxESLStudents', 'points', 0, settingsErrFlag)
    weightMinTeamSize, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightMinTeamSize', 'points', 0, settingsErrFlag)
    weightMaxTeamSize, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightMaxTeamSize', 'points', 0, settingsErrFlag)
    weightStudentPriority, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightStudentPriority', 'points', 0, settingsErrFlag)
    weightStudentChoice1, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightStudentChoice1', 'points', 0, settingsErrFlag)
    weightAvoid, settingsErrFlag = _read_int_setting(
        settingsByName, 'weightAvoid', 'points', 0, settingsErrFlag)

    # verify required values in 'max' and 'min' column are integers
    # if they are, assign value to global variable for scoring function to use
    defaultMaxTeamSize, settingsErrFlag = _read_int_setting(
        settingsByName, 'teamSize', 'max', 1, settingsErrFlag)
    defaultMinTeamSize, settingsErrFlag = _read_int_setting(
        settingsByName, 'teamSize', 'min', 1, settingsErrFlag)
    maxLowGPAStudents, settingsErrFlag = _read_int_setting(
        settingsByName, 'maxLowGPAStudents', 'max', 1, settingsErrFlag)
    maxESLStudents, settingsErrFlag = _read_int_setting(
        settingsByName, 'maxESLStudents', 'max', 1, settingsErrFlag)

    # if effort value is provided in the csv file, verify the value is an
    # integer within the required range; if it is not, use default value
    try:
        effort = int(settingsByName.at['effort', 'max'])
    except (TypeError, ValueError):
        effort = defaultEffort
        prints.warn(
            "valid 'effort' value not found in the settings csv. Running with default value."
        )
    else:
        if effort < minEffort or effort > maxEffort:
            effort = defaultEffort
            prints.warn(
                "'effort' in the settings csv is not an int between 1 and 100. Running with default value of 20."
            )

    # verify that provided lowGPAThreshold is not empty and that it's a float
    # within range. If it is, assign value to global variable for scoring
    # function to use
    try:
        lowGPAThreshold = float(settingsByName.at['lowGPAThreshold', 'min'])
    except (TypeError, ValueError):
        prints.logerr("The lowGPAThreshold 'min' value is not a float.")
        settingsErrFlag = True
        # temp value to pass statement below to allow program to continue
        # checking for errors
        lowGPAThreshold = 0
    if (lowGPAThreshold < minGPAThreshold) or (
            lowGPAThreshold > maxGPAThreshold) or (pd.isna(lowGPAThreshold)):
        prints.logerr(
            "lowGPAThreshold 'min' setting requires a 0.00 - 4.00 value.")
        settingsErrFlag = True

    return settingsErrFlag