Ejemplo n.º 1
0
def findDuplicateCols(fileData, requiredCol, csvFile):
    """Report an error when a required column header occurs twice.

    pandas renames a repeated header by appending '.1' to it, so finding
    '<requiredCol>.1' among the columns means the header was duplicated.

    Args:
        fileData: DataFrame loaded from the csv file.
        requiredCol: Header name that must be unique.
        csvFile: Human-readable file description used in the error message.
    """
    duplicateLabel = ''.join((str(requiredCol), '.1'))
    if duplicateLabel not in fileData.columns:
        return

    # NOTE(review): prints.err presumably ends the program, as the message
    # text says - confirm against the prints module.
    template = "{0} column is duplicated in the {1}. Terminating Program"
    prints.err(template.format(requiredCol, csvFile))
Ejemplo n.º 2
0
def studentsHandler(studentsFileData, progMode):
    """Validate the students csv data and store it in module globals.

    The required columns are checked first, then every studentChoiceN column
    is collected, and each column is validated in turn. Problems are reported
    through prints.logerr / prints.err. As a side effect of matching,
    studentAvoid, studentAssignment and the studentChoiceN cells are rewritten
    from ids to positional indexes into studentID / projectIDs.

    Reads the module-level names projectIDs and maxValue, which must be set
    before this function is called.

    Args:
        studentsFileData: DataFrame loaded from the students csv file.
        progMode: Program mode; 'Scoring' additionally requires and validates
            the 'assignment' column.

    Returns:
        True when at least one validation error was found, False otherwise.
    """
    # Series indexed by student
    global studentID
    global studentGPA
    global studentESL
    global studentPriority
    global studentAvoid
    global studentAssignment
    # DataFrame
    global studentChoiceN
    # Scalar
    global numStudents
    global numStudentChoices

    errFlag = False

    def logUnexpected(label, row, value):
        # Shared message for every "wrong kind of data" report
        prints.logerr("Unexpected data found in {0}, row {1} = '{2}'".format(
            label, row, value))

    def checkExists(dataSeries):
        """Return True (and log each row) when the series has empty cells."""
        errNan = False
        for student in range(numStudents):
            if pd.isna(dataSeries[student]):
                prints.logerr(
                    "Empty element found in {0} column, row {1}".format(
                        dataSeries.name, student))
                errNan = True
        return errNan

    def checkInt(dataSeries):
        """Return True when the series is not made of whole numbers."""
        errInt = False
        if not is_numeric_dtype(dataSeries):
            # A non-numeric column is always an error; the loop only reports
            # which cells caused it.
            errInt = True
            for student in range(numStudents):
                value = dataSeries[student]
                # Test for NaN first: a float NaN has no .isnumeric()
                if pd.isna(value):
                    continue
                if not str(value).isnumeric():
                    logUnexpected(dataSeries.name, student, value)
        elif not is_integer_dtype(dataSeries):
            # Numeric but not integer dtype (e.g. floats caused by blank
            # cells): whole-number values are fine, anything else is an error.
            for student in range(numStudents):
                value = dataSeries[student]
                if pd.isna(value):
                    continue
                if not float(value).is_integer():
                    logUnexpected(dataSeries.name, student, value)
                    errInt = True
        return errInt

    def checkFloat(dataSeries):
        """Return True when the series is not numeric, logging bad cells."""
        errFloat = False
        if not is_numeric_dtype(dataSeries):
            errFloat = True
            for student in range(numStudents):
                try:
                    float(dataSeries[student])
                except (ValueError, TypeError):
                    logUnexpected(dataSeries.name, student,
                                  dataSeries[student])
        return errFloat

    def checkBool(dataSeries):
        """Return True when the series holds anything but real booleans."""
        if is_bool_dtype(dataSeries):
            return False
        errBool = False
        for student in range(numStudents):
            val = dataSeries[student]
            # Genuine booleans are valid even when pandas kept object dtype
            if pd.api.types.is_bool(val):
                continue
            errBool = True
            # 'true'/'false' text still counts as an error but is not
            # reported as unexpected data.
            if not (isinstance(val, str) and val.lower() in ('true',
                                                             'false')):
                logUnexpected(dataSeries.name, student, val)
        return errBool

    def checkRange(dataSeries, rmin, rmax):
        """Return True when any value lies outside [rmin, rmax]."""
        errRange = False
        for value in dataSeries:
            if (value < rmin) or (value > rmax):
                prints.logerr(
                    '{0} outside of acceptable range for {1}. Accepted values are between {2} - {3}'
                    .format(value, dataSeries.name, rmin, rmax))
                errRange = True
        return errRange

    def checkDup(dataSeries):
        """Return True (and log each repeat) when values are duplicated."""
        duplicates = dataSeries[dataSeries.duplicated()]
        for value in duplicates:
            prints.logerr("Duplicate {0} found '{1}'".format(
                dataSeries.name, value))
        return not duplicates.empty

    def match1D(dataSeries,
                parentData):  # dataSeries is being checked against parentData
        """Replace each value with its position in parentData.

        Returns True when a non-empty value has no match in parentData.
        """
        errMatch = False
        for student in range(numStudents):
            value = dataSeries[student]
            if pd.isna(value):
                continue
            for j in range(len(parentData)):
                if value == parentData[j]:
                    # replace element with parentData index
                    dataSeries.at[student] = j
                    break
            else:
                prints.logerr(
                    "No match found in {0} column, row {1} = {2:n}".format(
                        dataSeries.name, student, value))
                errMatch = True
        return errMatch

    def matchProject2D(dataFrame):
        """Replace every project id in dataFrame with its projectIDs index.

        Returns True when a cell is not an integer or matches no project id.
        """
        errProj = False

        # when Pandas finds unexpected data type, elements in dataFrame cast as objects, not int
        for col in dataFrame.columns:
            # Flag decides whether cells need typecasting before matching
            numeric = is_numeric_dtype(dataFrame[col])
            for row in dataFrame.index:
                if pd.isna(dataFrame.at[row, col]):
                    continue
                # Numeric typecasting for when a letter is present in the column
                if not numeric:
                    try:
                        dataFrame.at[row, col] = int(dataFrame.at[row, col])
                    except (ValueError, TypeError, OverflowError):
                        logUnexpected(col, row, dataFrame.at[row, col])
                        errProj = True

                # Initialise per cell so the flag is defined even when
                # projectIDs is empty and never leaks from a previous cell.
                sChoiceMatch = False
                for i in range(len(projectIDs)):
                    if dataFrame.at[row, col] == projectIDs[i]:
                        sChoiceMatch = True
                        # replace project id with project index
                        dataFrame.at[row, col] = i
                        break
                if not sChoiceMatch:
                    prints.logerr(
                        "No matching project id found for {0} = '{1}'".
                        format(col, dataFrame.at[row, col]))
                    errProj = True
        return errProj

    # Each per-column check stops at its first failing stage, because later
    # stages assume the earlier ones passed.
    def checkStudentID(studentID):
        return (checkExists(studentID) or checkInt(studentID)
                or checkDup(studentID)
                or checkRange(studentID, 0, maxValue))

    def checkStudentGPA(studentGPA):
        return (checkExists(studentGPA) or checkFloat(studentGPA)
                or checkRange(studentGPA, 0.0, 4.0))

    def checkStudentESL(studentESL):
        return checkBool(studentESL)

    def checkStudentPriority(studentPriority):
        return checkBool(studentPriority)

    def checkAssignment(studentAssignment):
        return (checkExists(studentAssignment) or checkInt(studentAssignment)
                or match1D(studentAssignment, projectIDs))

    def checkAvoid(studentAvoid):
        # Empty cells are allowed here, so there is no checkExists stage
        return checkInt(studentAvoid) or match1D(studentAvoid, studentID)

    def checkChoices(studentChoiceN):
        # Both checks always run so every problem gets reported
        errChoice = checkExists(studentChoiceN['studentChoice1'])
        if matchProject2D(studentChoiceN):
            errChoice = True
        return errChoice

    # Verify required columns are present
    requiredColumns = [
        'studentID', 'studentChoice1', 'studentGPA', 'studentESL',
        'studentAvoid', 'studentPriority'
    ]
    for col in requiredColumns:
        if col not in studentsFileData.columns:
            prints.err(
                "Required {0} column header not found in the students csv file. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(studentsFileData, col, 'Students CSV File')

    # Search for sequential studentChoice columns to store in global dataframe
    choiceFields = ['studentChoice1']
    studentsCols = list(studentsFileData)
    for i in range(2, len(studentsFileData.columns)):
        sChoiceI = 'studentChoice' + str(i)
        if sChoiceI not in studentsCols:
            break  # the sequence ends at the first missing number
        choiceFields.append(sChoiceI)
    # Store studentChoice columns in global dataframe
    studentChoiceN = studentsFileData[choiceFields].copy()

    # Create global series
    studentID = studentsFileData['studentID'].copy()
    studentGPA = studentsFileData['studentGPA'].copy()
    studentESL = studentsFileData['studentESL'].fillna(False)
    studentPriority = studentsFileData['studentPriority'].fillna(False)
    studentAvoid = studentsFileData['studentAvoid'].copy()

    # Define global variable
    numStudents = len(studentsFileData)
    numStudentChoices = len(studentChoiceN.columns)

    # Verify student ID data; the avoid column is matched against the IDs,
    # so it is only checked when the IDs themselves are valid
    if checkStudentID(studentID):
        errFlag = True
    elif checkAvoid(studentAvoid):
        errFlag = True

    # Verify studentGPA data
    if checkStudentGPA(studentGPA):
        errFlag = True

    # Verify studentESL data
    if checkStudentESL(studentESL):
        errFlag = True

    # Verify student Priority data
    if checkStudentPriority(studentPriority):
        errFlag = True

    # Verify student Choices
    if checkChoices(studentChoiceN):
        errFlag = True

    # Verify assignment column when in Scoring mode
    if progMode == 'Scoring':
        if 'assignment' in studentsFileData.columns:
            # Store assignment column
            studentAssignment = studentsFileData['assignment'].copy()
            if checkAssignment(studentAssignment):
                errFlag = True
        else:
            prints.err("No assignment column found. Terminating program.")

    return errFlag
Ejemplo n.º 3
0
def projectsHandler(projectsFileData):
    """Validate the projects csv data and store it in module globals.

    Sets the globals projectIDs, minTeamSize and maxTeamSize from the values
    returned by int_checker. Blank team sizes are filled in place with
    defaultMinTeamSize / defaultMaxTeamSize before they are checked.

    Args:
        projectsFileData: DataFrame loaded from the projects csv file. Its
            minTeamSize / maxTeamSize columns are modified by this function.

    Returns:
        True when at least one validation error was found, False otherwise.
    """
    # arrays indexed by project
    global minTeamSize
    global maxTeamSize
    global projectIDs

    projectsErrFlag = False
    # verify required project csv headers are present and not duplicated
    requiredColumns = ['projectID', 'minTeamSize', 'maxTeamSize']
    for col in requiredColumns:
        if col not in projectsFileData.columns:
            prints.err(
                "Required {0} column not found in the projects csv file. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(projectsFileData, col, 'Projects CSV file')

    # verify that all values in program csv file are integers
    projectIDs, intErr = int_checker('projectID', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    # verify that there are no duplicate project IDs in the projectID column
    duplicateMask = projectsFileData.projectID.duplicated()
    if duplicateMask.any():
        projectsErrFlag = True
        # Report the repeated id itself rather than the whole csv row
        for duplicateID in projectsFileData.loc[duplicateMask, 'projectID']:
            prints.logerr(
                "Duplicate projectID found: {0}".format(duplicateID))

    # if values for team sizes are blank, enter size from settings csv and then verify all values are integers
    projectsFileData['minTeamSize'] = projectsFileData['minTeamSize'].fillna(
        defaultMinTeamSize)
    minTeamSize, intErr = int_checker('minTeamSize', None, projectsFileData, 1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)
    projectsFileData['maxTeamSize'] = projectsFileData['maxTeamSize'].fillna(
        defaultMaxTeamSize)
    # Reuse intErr so the maxTeamSize result is the one that gets recorded
    maxTeamSize, intErr = int_checker('maxTeamSize', None, projectsFileData,
                                      1)
    projectsErrFlag = check_error_flag(intErr, projectsErrFlag)

    # verify minTeamSize is not greater than maxTeamSize
    # zip() used to iterate in parallel over multiple iterables
    for minSize, maxSize, pid in zip(minTeamSize, maxTeamSize, projectIDs):
        if minSize > maxSize:
            projectsErrFlag = True
            prints.logerr(
                "minTeamSize is greater than maxTeamSize for projectID {0}.".
                format(pid))

    # warn user if gap found in projectID sequence, assuming projectIDs start at projectID '1'
    # arithmetic series = (n(firstNum + lastNum)) / 2, where n is # of terms in sequence,
    # then subtract real sum of projectIDs
    try:
        projectIDGap = projectIDs[-1] * (projectIDs[0] +
                                         projectIDs[-1]) / 2 - sum(projectIDs)
    except (ValueError, TypeError):
        # Non-numeric ids cannot be summed; that problem has already been
        # reported above, so just skip the gap warning.
        projectIDGap = 0
    if projectIDGap != 0:
        prints.warn(
            "gap found in projectID sequence in the projects csv file.")

    return projectsErrFlag
Ejemplo n.º 4
0
def settingsHandler(settingsFileData):
    """Validate the settings csv data and store it in module globals.

    Checks that the required columns and rows exist exactly once, then reads
    each setting: weights from the 'points' column, limits from 'max' / 'min'.
    Integer settings are validated by int_checker. 'effort' falls back to
    defaultEffort when it is missing, not an integer, or outside
    [minEffort, maxEffort].

    Args:
        settingsFileData: DataFrame loaded from the settings csv file.

    Returns:
        True when at least one validation error was found, False otherwise.
    """
    global weightMaxLowGPAStudents
    global weightMaxESLStudents
    global weightMaxTeamSize
    global weightMinTeamSize
    global weightStudentPriority
    global weightStudentChoice1
    global weightAvoid
    global effort
    global maxLowGPAStudents
    global maxESLStudents
    global lowGPAThreshold
    global defaultMaxTeamSize
    global defaultMinTeamSize

    settingsErrFlag = False

    # verify required settings csv headers are present and not duplicated
    requiredColumns = ['name', 'min', 'max', 'points']
    for col in requiredColumns:
        if col not in settingsFileData.columns:
            prints.err(
                "Required {0} column header not found in the settings csv. Terminating Program."
                .format(col))
        else:
            findDuplicateCols(settingsFileData, col, 'Settings CSV file')

    # verify required settings csv rows are present and not duplicated
    requiredRows = [
        'teamSize', 'lowGPAThreshold', 'maxLowGPAStudents', 'maxESLStudents',
        'weightMaxLowGPAStudents', 'weightMaxESLStudents', 'weightMinTeamSize',
        'weightMaxTeamSize', 'weightStudentPriority', 'weightStudentChoice1',
        'weightAvoid', 'effort'
    ]
    for row in requiredRows:
        if row not in settingsFileData['name'].values:
            prints.err(
                "Required {0} row not found in the settings csv file. Terminating Program."
                .format(row))
        if len(settingsFileData[settingsFileData['name'] == row]) > 1:
            prints.err(
                "Required {0} row is duplicated in the settings csv file. Terminating Program."
                .format(row))

    # Index the table by setting name once instead of once per lookup
    settings = settingsFileData.set_index('name')

    # verify required fields in 'points' column contain integers
    # if they are, assign value to global variable for scoring function to use
    weightMaxLowGPAStudents, intErr = int_checker(
        None, 'weightMaxLowGPAStudents',
        settings.at['weightMaxLowGPAStudents', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    weightMaxESLStudents, intErr = int_checker(
        None, 'weightMaxESLStudents',
        settings.at['weightMaxESLStudents', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    weightMinTeamSize, intErr = int_checker(
        None, 'weightMinTeamSize',
        settings.at['weightMinTeamSize', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    weightMaxTeamSize, intErr = int_checker(
        None, 'weightMaxTeamSize',
        settings.at['weightMaxTeamSize', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    weightStudentPriority, intErr = int_checker(
        None, 'weightStudentPriority',
        settings.at['weightStudentPriority', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    weightStudentChoice1, intErr = int_checker(
        None, 'weightStudentChoice1',
        settings.at['weightStudentChoice1', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    weightAvoid, intErr = int_checker(None, 'weightAvoid',
                                      settings.at['weightAvoid', 'points'], 0)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)

    # verify required values in 'max' and 'min' column are integers
    # if they are, assign value to global variable for scoring function to use
    defaultMaxTeamSize, intErr = int_checker(None, 'teamSize',
                                             settings.at['teamSize', 'max'], 1)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    defaultMinTeamSize, intErr = int_checker(None, 'teamSize',
                                             settings.at['teamSize', 'min'], 1)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    maxLowGPAStudents, intErr = int_checker(
        None, 'maxLowGPAStudents', settings.at['maxLowGPAStudents', 'max'], 1)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)
    maxESLStudents, intErr = int_checker(
        None, 'maxESLStudents', settings.at['maxESLStudents', 'max'], 1)
    settingsErrFlag = check_error_flag(intErr, settingsErrFlag)

    # if effort value is provided in the csv file, verify the value is an integer within the required range
    # if it is not, use default value
    try:
        effort = int(settings.at['effort', 'max'])
    except (ValueError, TypeError):
        prints.warn(
            "valid 'effort' value not found in the settings csv. Running with default value."
        )
        # Actually apply the default the warning promises; otherwise the
        # global would be left unset (or stale) after a failed conversion.
        effort = defaultEffort
    else:
        if effort < minEffort or effort > maxEffort:
            effort = defaultEffort
            prints.warn(
                "'effort' in the settings csv is not an int between 1 and 100. Running with default value of 20."
            )

    # verify that provided lowGPAThreshold is not empty and that it's a float within range
    # If it is, assign value to global variable for scoring function to use
    try:
        lowGPAThreshold = float(settings.at['lowGPAThreshold', 'min'])
    except ValueError:
        prints.logerr("The lowGPAThreshold 'min' value is not a float.")
        settingsErrFlag = True
        lowGPAThreshold = 0  # temp value to pass statement below to allow program to continue checking for errors
    # NaN compares False against both bounds, hence the explicit isna test
    if (lowGPAThreshold < minGPAThreshold) or (
            lowGPAThreshold > maxGPAThreshold) or (pd.isna(lowGPAThreshold)):
        prints.logerr(
            "lowGPAThreshold 'min' setting requires a 0.00 - 4.00 value.")
        settingsErrFlag = True

    return settingsErrFlag
Ejemplo n.º 5
0
def argumentParser():
    """Parse the command line and publish the run options.

    Sets the globals programMode and scoreBreakdown (only when the matching
    flag is given) and always sets mutationProbability, populationSize,
    eliteRatio, crossoverProbability and parentsPortion. A value that is not
    a number, or a ratio outside 0.0 - 1.0, produces a warning and is
    replaced by its default instead of aborting the program.

    In Assignment mode the output path is checked: an existing file triggers
    an overwrite warning and a missing directory is reported via prints.err.

    Returns:
        Tuple (studentsFileName, projectsFileName, settingsFileName,
        outputFileName) taken from the command line or the defaults.
    """
    global scoreBreakdown
    global programMode

    global mutationProbability
    global populationSize
    global eliteRatio
    global crossoverProbability
    global parentsPortion

    # Defaults are named once so the argparse default and the fallback used
    # after a bad value cannot drift apart.
    defaultMutation = 0.02
    defaultPopulation = 100
    defaultElite = 0.01
    defaultCrossover = 0.5
    defaultParents = 0.3

    # accepted command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--students",
                        help="Students CSV filename",
                        required=False,
                        default='students.csv')
    parser.add_argument("-p",
                        "--projects",
                        help="Projects CSV filename",
                        required=False,
                        default='projects.csv')
    parser.add_argument("-u",
                        "--settings",
                        help="Settings CSV filename",
                        required=False,
                        default='settings.csv')
    parser.add_argument("-o",
                        "--output",
                        help="Output CSV filename",
                        required=False,
                        default='assign.csv')
    parser.add_argument("-a",
                        "--assign",
                        help="Run program in Assignment mode",
                        required=False,
                        action='store_true')
    parser.add_argument("-c",
                        "--score",
                        help="Run the program in Scoring mode",
                        required=False,
                        action='store_true')
    parser.add_argument("-b",
                        "--breakdown",
                        help="Display score breakdown",
                        required=False,
                        action='store_true')
    parser.add_argument("-mutation",
                        "--mutation",
                        help="Mutation Probability",
                        required=False,
                        default=defaultMutation)
    parser.add_argument("-population",
                        "--population",
                        help="Population Size",
                        required=False,
                        default=defaultPopulation)
    parser.add_argument("-elite",
                        "--elite",
                        help="Elite Ratio",
                        required=False,
                        default=defaultElite)
    parser.add_argument("-crossover",
                        "--crossover",
                        help="Crossover Probability",
                        required=False,
                        default=defaultCrossover)
    parser.add_argument("-parents",
                        "--parents",
                        help="Parents Portion",
                        required=False,
                        default=defaultParents)

    argument = parser.parse_args()

    def parseRatio(raw, label, default):
        # Convert raw to a float in [0.0, 1.0]; warn and use default otherwise
        try:
            value = float(raw)
        except (TypeError, ValueError):
            prints.warn("{0} is not a number, defaulting to {1}".format(
                label, default))
            return default
        # Written as a chained comparison so NaN is rejected as well
        if not 0.0 <= value <= 1.0:
            prints.warn(
                "{0} is out of required range, defaulting to {1}".format(
                    label, default))
            return default
        return value

    # assign csv file names and mode preference
    # Assigned unconditionally so the return below can never hit an unbound
    # name, even when an empty string is passed on the command line.
    studentsFileName = argument.students
    projectsFileName = argument.projects
    settingsFileName = argument.settings
    outputFileName = argument.output

    if argument.score:
        programMode = 'Scoring'
    if argument.assign:
        programMode = 'Assignment'
        if argument.score:
            prints.warn(
                "both Scoring (-c) and Assignment (-a) modes selected. Program will run in Assignment mode."
            )
    if argument.breakdown:
        scoreBreakdown = True

    mutationProbability = parseRatio(argument.mutation,
                                     "Mutation Probability", defaultMutation)
    try:
        populationSize = int(argument.population)
    except ValueError:
        prints.warn("Population Size is not an integer, defaulting to 100")
        populationSize = defaultPopulation
    eliteRatio = parseRatio(argument.elite, "Elite Ratio", defaultElite)
    crossoverProbability = parseRatio(argument.crossover,
                                      "Crossover Probability",
                                      defaultCrossover)
    parentsPortion = parseRatio(argument.parents, "Parents Portion",
                                defaultParents)

    # when running program in Assignment mode
    # if output user provided already exists when running in Assignment mode, warn user
    # or if directory of user provided output does not exist, terminate program
    # NOTE(review): programMode is only assigned above when -a or -c is
    # given; otherwise this reads a module-level value - confirm one exists.
    if programMode == 'Assignment':
        if os.path.exists(outputFileName):
            prints.warn(
                "output file {0} already exists in the directory and will be overwritten with new assignments."
                .format(outputFileName))
        elif not os.path.isdir(os.path.dirname(
                os.path.abspath(outputFileName))):
            prints.err("directory for output file {0} does NOT exist.".format(
                outputFileName))

    return studentsFileName, projectsFileName, settingsFileName, outputFileName
Ejemplo n.º 6
0

if __name__ == "__main__":

    # command line parser and error handling
    studentFile, projectFile, settingFile, outputFile = argumentParser()

    # load csv files. returns csv file dataframe and final csv filename
    settingsFileData, settingsFile = csvFileCheck(settingFile)
    projectsFileData, projectsFile = csvFileCheck(projectFile)
    studentsFileData, studentsFile = csvFileCheck(studentFile)

    # terminate program if any errors detected in the csvFileCheck function
    if errFlag:
        prints.err(
            "Program Terminated in command line handler. See messages(s) above for additional information."
        )

    # read, parse, and handle errors of all three csv files
    # errFlag used if errors are found in the csv files
    # violating csv files are appended to array to inform user which file(s) the errors came from
    errFiles = []
    if load_csv.settingsHandler(settingsFileData):
        errFiles.append(settingsFile)
    if load_csv.projectsHandler(projectsFileData):
        errFiles.append(projectsFile)
    if load_csv.studentsHandler(studentsFileData, programMode):
        errFiles.append(studentsFile)

    if errFiles:
        prints.err(