def test_CheckIfSameLength(self):
     """Descriptions and eligibilities fetched for 'Junior' must both exist and be equally long."""
     juniorInfo = GetDatabaseInfoScholarshipsWithClassStatuses('Junior')
     descriptions = juniorInfo.getScholarshipDescriptionsList()
     eligibilities = juniorInfo.getEligibilitiesList()
     # Check descriptions first, then eligibilities, before comparing lengths.
     for fetchedList in (descriptions, eligibilities):
         self.assertIsNotNone(fetchedList)
     self.assertEqual(len(descriptions), len(eligibilities))
 def test_eligibilitiesList(self):
     """Pass the first 'Senior' eligibility entry through CleanText and expect a non-None result."""
     seniorInfo = GetDatabaseInfoScholarshipsWithClassStatuses('Senior')
     self.assertIsNotNone(seniorInfo)
     eligibilities = seniorInfo.getEligibilitiesList()
     self.assertIsNotNone(eligibilities)
     # Only the first entry is sampled.
     cleanedEligibility = CleanText.cleanALLtheText(eligibilities[0])
     self.assertIsNotNone(cleanedEligibility)
 def test_scholarshipsDescriptionsList(self):
     """Pass the first 'Junior' description entry through CleanText and expect a non-None result."""
     juniorInfo = GetDatabaseInfoScholarshipsWithClassStatuses('Junior')
     self.assertIsNotNone(juniorInfo)
     descriptions = juniorInfo.getScholarshipDescriptionsList()
     self.assertIsNotNone(descriptions)
     # Only the first entry is sampled.
     cleanedDescription = CleanText.cleanALLtheText(descriptions[0])
     self.assertIsNotNone(cleanedDescription)
コード例 #4
0
    def test_runEnsembleClassifier(self):
        """Build the pretrained-models multilabel classifier from database text/ids and display its results."""
        scholarshipInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        texts = scholarshipInfo.getConcatenatedDescriptionsEligibilities()
        ids = scholarshipInfo.getScholarshipsWithClassStatusIdsList()

        classifier = LogisticRegressionMultilabelClassifyClassStatusFromPretrainedModels(
            texts, ids)
        classifier.displayResults()
コード例 #5
0
 def test_OVRClassifier(self):
     """Train, test and display a OneVsRestClassifyPreviouslyUntrained run on the database data.

     The text, label and id lists are read from a single
     GetDatabaseInfoScholarshipsWithClassStatuses instance instead of three
     separately constructed ones, so the three lists share one source.
     """
     # NOTE(review): assumes the getters are independent reads on one
     # instance -- confirm against GetDatabaseInfoScholarshipsWithClassStatuses.
     databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
     dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
     labelsList = databaseInfo.getRequirementNeededList()
     idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

     testClassify = OneVsRestClassifyPreviouslyUntrained(
         dataTextList, labelsList, idsList, trainingPercentage=0.8)
     testClassify.trainTestAndDisplayResults()
コード例 #6
0
    def test_makeTrainingTestingSetBadPercentage(self):
        """A training percentage of 2 must make the binary data-set builder return None."""
        classStatus = 'Senior'
        matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=classStatus)
        otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=classStatus, useNot=True)

        matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
        matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
        otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
        otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

        # 2 is the deliberately invalid percentage under test.
        splitResult = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=classStatus,
            firstLabelTextList=matchingTexts,
            firstLabelIdsList=matchingIds,
            secondLabel='Other',
            secondLabelTextList=otherTexts,
            secondLabelIdsList=otherIds,
            trainingPercentage=2)

        self.assertIsNone(splitResult)
    def __init__(self):
        """Load actual and predicted labels from the database and run every accuracy calculation.

        The raw values from getRequirementNeededList() and
        getEnsembleClassifierPredictions() are stored first and then replaced
        by the results of the two convert*ToList() methods.
        """
        labelSource = GetDatabaseInfoScholarshipsWithClassStatuses()
        self.actualLabels = labelSource.getRequirementNeededList()
        self.predictedLabels = labelSource.getEnsembleClassifierPredictions()

        # Presumably the converters read the raw attributes set above, so the
        # original assignment order is kept.
        self.actualLabels = self.convertActualLabelStringsToList()
        self.predictedLabels = self.convertPredictedLabelStringsToList()

        self.calculateExactMatchAccuracy()
        self.calculateNumLabelsAccuracy()
        averageAccuracy = self.calculateAverageAccuracyWithinLists(
            self.actualLabels, self.predictedLabels)
        print("Average Accuracy Across Labels: %.2f percent" % averageAccuracy)
        self.calculateListAccuracyWithinMatchedGreaterOrLesserListLengths()
コード例 #8
0
    def test_makeTrainingTestingSets(self):
        """Split 'Junior' vs. 'Other' data at 0.9 and check the sizes of the resulting sets."""
        classStatus = 'Junior'
        splitFraction = 0.9

        matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=classStatus)
        otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=classStatus, useNot=True)

        matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
        matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
        otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
        otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

        splitSets = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=classStatus,
            firstLabelTextList=matchingTexts,
            firstLabelIdsList=matchingIds,
            secondLabel='Other',
            secondLabelTextList=otherTexts,
            secondLabelIdsList=otherIds,
            trainingPercentage=splitFraction)
        trainingPart, testingPart = splitSets[0], splitSets[1]
        combinedSize = len(trainingPart) + len(testingPart)

        # The training part must hold ceil(total * fraction) items ...
        expectedTrainingSize = math.ceil(combinedSize * splitFraction)
        self.assertEqual(expectedTrainingSize, len(trainingPart))
        # ... and the two parts together must account for every input text.
        self.assertEqual(len(matchingTexts) + len(otherTexts), combinedSize)
コード例 #9
0
    def test_makeMultilabelTrainingAndTestingSets(self):
        """Split the multilabel data at 0.9 and check the training set's size.

        The text, label and id lists are read from a single
        GetDatabaseInfoScholarshipsWithClassStatuses instance instead of three
        separately constructed ones, so the three lists share one source.
        """
        # NOTE(review): assumes the getters are independent reads on one
        # instance -- confirm against GetDatabaseInfoScholarshipsWithClassStatuses.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        labelsList = databaseInfo.getRequirementNeededList()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
        trainingPercentage = 0.9

        trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            dataTextList, labelsList, idsList, trainingPercentage)
        totalSetSize = len(trainingSet) + len(testingSet)

        # The training set must hold ceil(total * percentage) items.
        self.assertEqual(math.ceil(totalSetSize * trainingPercentage),
                         len(trainingSet))
コード例 #10
0
    def test_makeSureTheLabelsListIsAListOfListsOfStrings(self):
        """Check that the classifier's training labels are a list of lists of str.

        The text, label and id lists are read from a single
        GetDatabaseInfoScholarshipsWithClassStatuses instance instead of three
        separately constructed ones, so the three lists share one source.
        """
        # NOTE(review): assumes the getters are independent reads on one
        # instance -- confirm against GetDatabaseInfoScholarshipsWithClassStatuses.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        labelsList = databaseInfo.getRequirementNeededList()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
        testClassify = OneVsRestClassifyPreviouslyUntrained(
            dataTextList, labelsList, idsList, trainingPercentage=0.8)

        trainingSet = testClassify.trainingSet
        trainingLabels = [training['label'] for training in trainingSet]
        # Exact-type checks on the container, its second entry, and that entry's first label.
        self.assertEqual(type(trainingLabels), list)
        self.assertEqual(type(trainingLabels[1]), list)
        self.assertEqual(type(trainingLabels[1][0]), str)
    def test_CheckLabelsListFormat(self):
        """Check the label list's types, then fit a one-vs-rest model with its first 100 entries."""
        labelSource = GetDatabaseInfoScholarshipsWithClassStatuses()
        allLabels = labelSource.getRequirementNeededList()
        self.assertEqual(type(allLabels), list)
        self.assertEqual(type(allLabels[1]), list)
        # Take exactly 100 label entries.
        labelSample = allLabels[:100]
        self.assertEqual(len(labelSample), 100)

        # Synthetic feature matrix; the generated targets are not used.
        syntheticFeatures, _syntheticTargets = make_multilabel_classification(
            n_classes=10, n_labels=3, allow_unlabeled=False)

        # Fit the synthetic features against the sampled database labels.
        oneVsRest = OneVsRestClassifier(LogisticRegression())
        oneVsRest.fit(syntheticFeatures, labelSample)
コード例 #12
0
    def test_SeeWhatTheMultilabelTrainingSetIsActuallyPassingAsLabels(self):
        """Check that labels in the multilabel training set are lists of str.

        The text, label and id lists are read from a single
        GetDatabaseInfoScholarshipsWithClassStatuses instance instead of three
        separately constructed ones, so the three lists share one source.
        """
        # NOTE(review): assumes the getters are independent reads on one
        # instance -- confirm against GetDatabaseInfoScholarshipsWithClassStatuses.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        labelsList = databaseInfo.getRequirementNeededList()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
        trainingPercentage = 0.9

        # Only the training half is inspected; the testing half is unused here.
        trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            dataTextList, labelsList, idsList, trainingPercentage)

        trainingLabels = [
            trainingInstance['label'] for trainingInstance in trainingSet
        ]
        self.assertEqual(type(trainingLabels[1]), list)
        self.assertEqual(type(trainingLabels[1][0]), str)
コード例 #13
0
    def test_trainOVRLRModel(self):
        """Train a one-vs-rest model on the database data and save it to the two model files."""
        # Raw strings: the backslash before 'O' is not a valid escape sequence
        # and triggers a warning in a normal literal. The path value is unchanged.
        modelSaveFile = r'OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedModel'
        featuresValueCountsSaveFile = r'OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedFeaturesValueCounts'

        # Fail early if either save file cannot be opened; ``with`` closes the handle.
        for savePath in (modelSaveFile, featuresValueCountsSaveFile):
            with open(savePath, 'rb'):
                pass

        # Do the training and save to files. All three lists come from one
        # database-info instance rather than three separately built ones.
        # NOTE(review): assumes the getters are independent reads on one
        # instance -- confirm against GetDatabaseInfoScholarshipsWithClassStatuses.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        labelsList = databaseInfo.getRequirementNeededList()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
        testClassify = OneVsRestClassifyPreviouslyUntrained(dataTextList, labelsList, idsList,
                                                            trainingPercentage=0.99)
        testClassify.trainAndSaveOVRModel(modelSaveFile, featuresValueCountsSaveFile)
コード例 #14
0
    def test_RunEnsembleClassifierInsertResultsIntoDB(self):
        """Run the pretrained one-vs-rest model and write each prediction back to the database.

        Each prediction is joined with ', ' and stored in the
        OneVsRestClassifierPrediction column of the row with the matching id.
        """
        # Get data from the db; both lists come from one database-info instance.
        # NOTE(review): assumes the getters are independent reads on one
        # instance -- confirm against GetDatabaseInfoScholarshipsWithClassStatuses.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

        # Run classifier, return results. Raw strings: the backslash before 'O'
        # is not a valid escape sequence in a normal literal; the path value is unchanged.
        pretrainedOVRLRModelFilePath = r'..\OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedModel'
        pretrainedOVRLRFeaturesValueCountsIndexesFilePath = r'..\OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedFeaturesValueCounts'
        testClassifier = OneVsRestClassifyFromPretrainedModel(pretrainedOVRLRModelFilePath,
                                                              pretrainedOVRLRFeaturesValueCountsIndexesFilePath,
                                                              dataTextList, idsList)
        predictions = testClassifier.getPredictions()

        # Insert results into db.
        # NOTE(review): the statement is built by string concatenation; a
        # prediction containing a single quote would break it. Switch to a
        # parameterized call if SUDBConnect offers one.
        db = SUDBConnect()
        for scholarshipWithClassStatusID, prediction in zip(idsList, predictions):
            prediction = ', '.join(prediction)
            db.insertUpdateOrDeleteDB(
                "update dbo.ScholarshipsWithClassStatuses set OneVsRestClassifierPrediction='" + prediction + "' where ScholarshipsWithClassStatusId = '" + str(
                    scholarshipWithClassStatusID) + "'")
    def test_runModelOnFreshman(self):
        """Run the saved 'Freshman' logistic-regression model over the database text and display results."""
        classStatus = 'Freshman'
        # Raw strings: the backslashes before 'C' and '%' are not valid escape
        # sequences in a normal literal. The formatted paths are unchanged.
        modelSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedLRModel' % classStatus
        featuresValueCountsSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedFeaturesValueCounts' % classStatus

        # Make sure the model and fvc save files can be opened; ``with`` closes the handle.
        for savedFile in (modelSaveFile, featuresValueCountsSaveFile):
            with open(savedFile, 'rb'):
                pass

        # Now do the test.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
        testClassify = LogisticRegressionClassifyClassStatusFromPretrainedModel(trainedModelInputFile=modelSaveFile,
                                                              trainedFeaturesValueCountsIndexesFile=featuresValueCountsSaveFile,
                                                              testingDataTextList=dataTextList,
                                                              testingDataIdsList=idsList)
        testClassify.displayResults()
コード例 #16
0
    def test_RunEnsembleClassifierInsertResultsIntoDB(self):
        """Run the ensemble classifier and write each filtered prediction back to the database.

        Each prediction is joined with ', ' and stored in the
        EnsembleClassifierPrediction column of the row with the matching id.
        """
        # Fetch the input text and ids.
        scholarshipInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        texts = scholarshipInfo.getConcatenatedDescriptionsEligibilities()
        sourceIds = scholarshipInfo.getScholarshipsWithClassStatusIdsList()

        # Classify; each result entry is indexed as (id, prediction labels).
        ensemble = LogisticRegressionMultilabelClassifyClassStatusFromPretrainedModels(
            texts, sourceIds)
        resultsById = ensemble.doAllClassificationsAndReturnFilteredPredictionsListsById()

        # Write one update statement per result.
        connection = SUDBConnect()
        resultIds = [entry[0] for entry in resultsById]
        resultLabels = [entry[1] for entry in resultsById]
        for rowId, labels in zip(resultIds, resultLabels):
            joinedLabels = ', '.join(labels)
            updateStatement = (
                "update dbo.ScholarshipsWithClassStatuses set EnsembleClassifierPrediction='"
                + joinedLabels
                + "' where ScholarshipsWithClassStatusId = '"
                + str(rowId)
                + "'")
            connection.insertUpdateOrDeleteDB(updateStatement)
コード例 #17
0
    def test_runModelOnFreshman(self):
        """Run the saved 'Freshman' logistic-regression model over the database text and display results."""
        classStatus = 'Freshman'
        # Raw strings: the backslashes before 'C' and '%' are not valid escape
        # sequences in a normal literal. The formatted paths are unchanged.
        modelSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedLRModel' % classStatus
        featuresValueCountsSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedFeaturesValueCounts' % classStatus

        # Make sure the model and fvc save files can be opened; ``with`` closes the handle.
        for savedFile in (modelSaveFile, featuresValueCountsSaveFile):
            with open(savedFile, 'rb'):
                pass

        # Now do the test.
        databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
        dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
        idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
        testClassify = LogisticRegressionClassifyClassStatusFromPretrainedModel(
            trainedModelInputFile=modelSaveFile,
            trainedFeaturesValueCountsIndexesFile=featuresValueCountsSaveFile,
            testingDataTextList=dataTextList,
            testingDataIdsList=idsList)
        testClassify.displayResults()
    def __init__(self, classStatus, trainingPercentage, modelSaveFile=None, featuresValuesCountsSaveFile=None):
        """Build the training/testing data for a classStatus-vs-'Other' logistic-regression classifier.

        Args:
            classStatus: passed as requirementNeeded to select the positive
                examples; the same query with useNot=True supplies the
                'Other' examples.
            trainingPercentage: forwarded to
                MakeDataSet.makeBinaryLabelTrainingAndTestingSet to size the
                training split.
            modelSaveFile: stored on the instance; not used in this constructor.
            featuresValuesCountsSaveFile: stored as
                self.featuresValueCountsSaveFile; not used in this constructor.

        Raises:
            ValueError: if the data-set builder returns None for the given
                trainingPercentage.
        """
        self.classStatus = classStatus
        self.trainingPercentage = trainingPercentage
        self.badLabel = 'Other'
        self.modelSaveFile = modelSaveFile
        self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

        print('Creating datasets...')
        goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus)
        badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus, useNot=True)

        goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
        goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
        badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
        badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

        # Honour the caller's trainingPercentage; it used to be ignored in
        # favour of a fixed 0.8.
        trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=self.classStatus, secondLabel=self.badLabel, firstLabelTextList=goodClassStatusDataTextList,
            secondLabelTextList=badClassStatusDataTextList, firstLabelIdsList=goodClassStatusIds,
            secondLabelIdsList=badClassStatusIds, trainingPercentage=self.trainingPercentage)
        if trainingTestingList is None:
            raise ValueError('Could not build training/testing sets for trainingPercentage=%r'
                             % (self.trainingPercentage,))
        self.training = trainingTestingList[0]
        self.testing = trainingTestingList[1]

        self.dataFrame = self.makeDataFrame()

        # Filled in later by methods outside this constructor.
        self.trainingVectors = []
        self.testingVectors = []

        self.featuresValueCountsIndexes = []
        self.logisticRegressionClassifier = LogisticRegression()
コード例 #19
0
    def __init__(self,
                 classStatus,
                 trainingPercentage,
                 modelSaveFile=None,
                 featuresValuesCountsSaveFile=None):
        """Build the training/testing data for a classStatus-vs-'Other' logistic-regression classifier.

        Args:
            classStatus: passed as requirementNeeded to select the positive
                examples; the same query with useNot=True supplies the
                'Other' examples.
            trainingPercentage: forwarded to
                MakeDataSet.makeBinaryLabelTrainingAndTestingSet to size the
                training split.
            modelSaveFile: stored on the instance; not used in this constructor.
            featuresValuesCountsSaveFile: stored as
                self.featuresValueCountsSaveFile; not used in this constructor.

        Raises:
            ValueError: if the data-set builder returns None for the given
                trainingPercentage.
        """
        self.classStatus = classStatus
        self.trainingPercentage = trainingPercentage
        self.badLabel = 'Other'
        self.modelSaveFile = modelSaveFile
        self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

        print('Creating datasets...')
        goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=self.classStatus)
        badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=self.classStatus, useNot=True)

        goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
        goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
        badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
        badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

        # Honour the caller's trainingPercentage; it used to be ignored in
        # favour of a fixed 0.8.
        trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=self.classStatus,
            secondLabel=self.badLabel,
            firstLabelTextList=goodClassStatusDataTextList,
            secondLabelTextList=badClassStatusDataTextList,
            firstLabelIdsList=goodClassStatusIds,
            secondLabelIdsList=badClassStatusIds,
            trainingPercentage=self.trainingPercentage)
        if trainingTestingList is None:
            raise ValueError(
                'Could not build training/testing sets for trainingPercentage=%r'
                % (self.trainingPercentage,))
        self.training = trainingTestingList[0]
        self.testing = trainingTestingList[1]

        self.dataFrame = self.makeDataFrame()

        # Filled in later by methods outside this constructor.
        self.trainingVectors = []
        self.testingVectors = []

        self.featuresValueCountsIndexes = []
        self.logisticRegressionClassifier = LogisticRegression()
コード例 #20
0
    def test_makeTrainingTestingSetBadPercentage(self):
        """The binary data-set builder must return None when given a training percentage of 2."""
        seniorStatus = 'Senior'
        invalidPercentage = 2

        seniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=seniorStatus)
        nonSeniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=seniorStatus, useNot=True)

        seniorIds = seniorDB.getScholarshipsWithClassStatusIdsList()
        seniorTexts = seniorDB.getConcatenatedDescriptionsEligibilities()
        nonSeniorIds = nonSeniorDB.getScholarshipsWithClassStatusIdsList()
        nonSeniorTexts = nonSeniorDB.getConcatenatedDescriptionsEligibilities()

        builderResult = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=seniorStatus,
            secondLabel='Other',
            firstLabelTextList=seniorTexts,
            secondLabelTextList=nonSeniorTexts,
            firstLabelIdsList=seniorIds,
            secondLabelIdsList=nonSeniorIds,
            trainingPercentage=invalidPercentage)

        self.assertIsNone(builderResult)
コード例 #21
0
    def test_makeTrainingTestingSets(self):
        """Split 'Junior' vs. 'Other' data at 0.9 and verify both the training size and the total size."""
        juniorStatus = 'Junior'
        fraction = 0.9

        juniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=juniorStatus)
        nonJuniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=juniorStatus, useNot=True)

        juniorIds = juniorDB.getScholarshipsWithClassStatusIdsList()
        juniorTexts = juniorDB.getConcatenatedDescriptionsEligibilities()
        nonJuniorIds = nonJuniorDB.getScholarshipsWithClassStatusIdsList()
        nonJuniorTexts = nonJuniorDB.getConcatenatedDescriptionsEligibilities()

        trainingAndTesting = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=juniorStatus,
            secondLabel='Other',
            firstLabelTextList=juniorTexts,
            secondLabelTextList=nonJuniorTexts,
            firstLabelIdsList=juniorIds,
            secondLabelIdsList=nonJuniorIds,
            trainingPercentage=fraction)
        trainingItems = trainingAndTesting[0]
        testingItems = trainingAndTesting[1]
        overallSize = len(trainingItems) + len(testingItems)

        # Training size is ceil(total * fraction); the total equals the input count.
        self.assertEqual(math.ceil(overallSize * fraction), len(trainingItems))
        inputCount = len(juniorTexts) + len(nonJuniorTexts)
        self.assertEqual(inputCount, overallSize)