def test_CheckIfSameLength(self):
    """Both 'Junior' lists must be present and have the same length."""
    juniorInfo = GetDatabaseInfoScholarshipsWithClassStatuses('Junior')
    descriptions = juniorInfo.getScholarshipDescriptionsList()
    eligibilities = juniorInfo.getEligibilitiesList()

    # Presence is checked for both lists before their sizes are compared.
    for fetchedList in (descriptions, eligibilities):
        self.assertIsNotNone(fetchedList)
    self.assertEqual(len(descriptions), len(eligibilities))
def test_eligibilitiesList(self):
    """The first 'Senior' eligibility text can be fetched and passed through CleanText."""
    seniorInfo = GetDatabaseInfoScholarshipsWithClassStatuses('Senior')
    self.assertIsNotNone(seniorInfo)

    eligibilities = seniorInfo.getEligibilitiesList()
    self.assertIsNotNone(eligibilities)

    # Only the first entry is cleaned; indexing assumes at least one entry exists.
    cleanedEligibility = CleanText.cleanALLtheText(eligibilities[0])
    self.assertIsNotNone(cleanedEligibility)
def test_scholarshipsDescriptionsList(self):
    """The first 'Junior' scholarship description can be fetched and passed through CleanText."""
    juniorInfo = GetDatabaseInfoScholarshipsWithClassStatuses('Junior')
    self.assertIsNotNone(juniorInfo)

    descriptions = juniorInfo.getScholarshipDescriptionsList()
    self.assertIsNotNone(descriptions)

    # Only the first entry is cleaned; indexing assumes at least one entry exists.
    cleanedDescription = CleanText.cleanALLtheText(descriptions[0])
    self.assertIsNotNone(cleanedDescription)
def test_runEnsembleClassifier(self):
    """Run the pretrained ensemble classifier on the unfiltered accessor's data and display the results.

    No assertions are made; the test passes as long as nothing raises.
    """
    source = GetDatabaseInfoScholarshipsWithClassStatuses()
    texts = source.getConcatenatedDescriptionsEligibilities()
    ids = source.getScholarshipsWithClassStatusIdsList()

    classifier = LogisticRegressionMultilabelClassifyClassStatusFromPretrainedModels(texts, ids)
    classifier.displayResults()
def test_OVRClassifier(self):
    """Train, test and display a one-vs-rest classifier built with trainingPercentage=0.8.

    No assertions are made; the test passes as long as nothing raises.
    """
    # A single accessor supplies all three lists so that texts, labels and ids
    # come from the same object instead of three independent database reads.
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    labelsList = databaseInfo.getRequirementNeededList()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

    testClassify = OneVsRestClassifyPreviouslyUntrained(
        dataTextList, labelsList, idsList, trainingPercentage=0.8)
    testClassify.trainTestAndDisplayResults()
def test_makeTrainingTestingSetBadPercentage(self):
    """A training percentage of 2 must make the binary dataset builder return None."""
    targetStatus = 'Senior'

    # Records carrying the target status, and (useNot=True) the complementary set.
    matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus)
    otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus, useNot=True)

    matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
    matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
    otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
    otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

    result = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=targetStatus,
        secondLabel='Other',
        firstLabelTextList=matchingTexts,
        secondLabelTextList=otherTexts,
        firstLabelIdsList=matchingIds,
        secondLabelIdsList=otherIds,
        trainingPercentage=2)
    self.assertIsNone(result)
def __init__(self):
    """Load actual and predicted labels, convert them, and run the accuracy reports.

    The raw values are stored on the instance first and then replaced by the
    results of the two convert* methods. Those methods take no arguments, so
    they presumably read the attributes set just before; the statement order
    is therefore kept exactly.
    """
    source = GetDatabaseInfoScholarshipsWithClassStatuses()
    self.actualLabels = source.getRequirementNeededList()
    self.predictedLabels = source.getEnsembleClassifierPredictions()

    self.actualLabels = self.convertActualLabelStringsToList()
    self.predictedLabels = self.convertPredictedLabelStringsToList()

    self.calculateExactMatchAccuracy()
    self.calculateNumLabelsAccuracy()

    averageAccuracy = self.calculateAverageAccuracyWithinLists(self.actualLabels, self.predictedLabels)
    print("Average Accuracy Across Labels: %.2f percent" % averageAccuracy)

    self.calculateListAccuracyWithinMatchedGreaterOrLesserListLengths()
def test_makeTrainingTestingSets(self):
    """A 0.9 split of the 'Junior' data must size the training set correctly and lose no records."""
    targetStatus = 'Junior'
    splitFraction = 0.9

    # Records carrying the target status, and (useNot=True) the complementary set.
    matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus)
    otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus, useNot=True)

    matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
    matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
    otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
    otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

    splitSets = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=targetStatus,
        secondLabel='Other',
        firstLabelTextList=matchingTexts,
        secondLabelTextList=otherTexts,
        firstLabelIdsList=matchingIds,
        secondLabelIdsList=otherIds,
        trainingPercentage=splitFraction)
    trainingPart = splitSets[0]
    testingPart = splitSets[1]
    combinedSize = len(trainingPart) + len(testingPart)

    # The training part is expected to hold ceil(total * fraction) items.
    self.assertEqual(math.ceil(combinedSize * splitFraction), len(trainingPart))
    # Every input text must land in exactly one of the two parts.
    self.assertEqual((len(matchingTexts) + len(otherTexts)), combinedSize)
def test_makeMultilabelTrainingAndTestingSets(self):
    """A 0.9 multilabel split must put ceil(total * 0.9) items in the training set."""
    # A single accessor supplies all three lists so that texts, labels and ids
    # come from the same object instead of three independent database reads.
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    labelsList = databaseInfo.getRequirementNeededList()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

    trainingPercentage = 0.9
    trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        dataTextList, labelsList, idsList, trainingPercentage)

    totalSetSize = len(trainingSet) + len(testingSet)
    self.assertEqual(math.ceil(totalSetSize * trainingPercentage), len(trainingSet))
def test_makeSureTheLabelsListIsAListOfListsOfStrings(self):
    """The classifier's training labels must be a list whose entries are lists of strings.

    Only the second training entry (index 1) and its first label are inspected,
    as in the original test.
    """
    # A single accessor supplies all three lists so that texts, labels and ids
    # come from the same object instead of three independent database reads.
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    labelsList = databaseInfo.getRequirementNeededList()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

    testClassify = OneVsRestClassifyPreviouslyUntrained(
        dataTextList, labelsList, idsList, trainingPercentage=0.8)
    trainingLabels = [training['label'] for training in testClassify.trainingSet]

    # assertIsInstance reports the offending value on failure and is the
    # idiomatic replacement for comparing type(x) against a class.
    self.assertIsInstance(trainingLabels, list)
    self.assertIsInstance(trainingLabels[1], list)
    self.assertIsInstance(trainingLabels[1][0], str)
def test_CheckLabelsListFormat(self):
    """Check the label list is a list of lists, then fit a one-vs-rest model on its first 100 entries.

    The features are synthetic; only the labels come from the database.
    """
    # The label slice and the synthetic feature matrix must have the same
    # number of rows, so the count is named once and used for both.
    sampleCount = 100

    # get the list of labels, 100
    dbinfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    listOLabels = dbinfo.getRequirementNeededList()
    self.assertIsInstance(listOLabels, list)
    self.assertIsInstance(listOLabels[1], list)
    first100Labels = listOLabels[:sampleCount]
    self.assertEqual(len(first100Labels), sampleCount)

    # get fake data (the generated label matrix is not needed here)
    X, _ = make_multilabel_classification(n_samples=sampleCount, n_classes=10, n_labels=3,
                                          allow_unlabeled=False)

    # test the OVR
    # NOTE(review): recent scikit-learn releases may reject list-of-lists
    # labels in fit() and require a binary indicator matrix; confirm against
    # the installed version.
    testClassifiier = OneVsRestClassifier(LogisticRegression())
    testClassifiier.fit(X, first100Labels)
def test_SeeWhatTheMultilabelTrainingSetIsActuallyPassingAsLabels(self):
    """Labels in the multilabel training set must be lists of strings.

    Only the second training entry (index 1) and its first label are inspected,
    as in the original test.
    """
    # A single accessor supplies all three lists so that texts, labels and ids
    # come from the same object instead of three independent database reads.
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    labelsList = databaseInfo.getRequirementNeededList()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

    trainingPercentage = 0.9
    # The testing half of the split is not inspected by this test.
    trainingSet, _ = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        dataTextList, labelsList, idsList, trainingPercentage)
    trainingLabels = [trainingInstance['label'] for trainingInstance in trainingSet]

    # assertIsInstance reports the offending value on failure and is the
    # idiomatic replacement for comparing type(x) against a class.
    self.assertIsInstance(trainingLabels[1], list)
    self.assertIsInstance(trainingLabels[1][0], str)
def test_trainOVRLRModel(self):
    """Train a one-vs-rest model with trainingPercentage=0.99 and save it to the two target files.

    Both files are opened for reading first, so the test fails before the
    training step if either one is missing.
    """
    # Raw strings keep the backslashes literal; the plain literals relied on
    # '\O' being an unrecognised escape, which newer Pythons warn about.
    # The resulting path text is unchanged.
    modelSaveFile = r'OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedModel'
    featuresValueCountsSaveFile = r'OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedFeaturesValueCounts'

    # first check to see if can open files:
    with open(modelSaveFile, 'rb'):
        pass
    with open(featuresValueCountsSaveFile, 'rb'):
        pass

    # do training and save to files
    # A single accessor supplies all three lists so that texts, labels and ids
    # come from the same object instead of three independent database reads.
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    labelsList = databaseInfo.getRequirementNeededList()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

    testClassify = OneVsRestClassifyPreviouslyUntrained(dataTextList, labelsList, idsList,
                                                        trainingPercentage=0.99)
    testClassify.trainAndSaveOVRModel(modelSaveFile, featuresValueCountsSaveFile)
def test_RunEnsembleClassifierInsertResultsIntoDB(self):
    """Classify every record with the pretrained one-vs-rest model and store the predictions.

    Each prediction is joined with ', ' and written to the
    OneVsRestClassifierPrediction column of dbo.ScholarshipsWithClassStatuses
    for the matching ScholarshipsWithClassStatusId. No assertions are made.
    """
    # get data from db
    # One accessor supplies both lists so texts and ids come from the same object.
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()

    # run classifier, return results
    # Raw strings keep the backslashes literal; the plain literals relied on
    # '\O' being an unrecognised escape. The resulting path text is unchanged.
    pretrainedOVRLRModelFilePath = r'..\OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedModel'
    pretrainedOVRLRFeaturesValueCountsIndexesFilePath = (
        r'..\OneVsRestLRTrainedClassifiers\OneVsRestLRTrainedFeaturesValueCounts')
    testClassifier = OneVsRestClassifyFromPretrainedModel(pretrainedOVRLRModelFilePath,
                                                          pretrainedOVRLRFeaturesValueCountsIndexesFilePath,
                                                          dataTextList, idsList)
    predictions = testClassifier.getPredictions()

    # insert results into db
    db = SUDBConnect()
    for scholarshipWithClassStatusID, prediction in zip(idsList, predictions):
        # The statement is assembled as text, so single quotes inside a value
        # are doubled to keep the SQL string literals well-formed.
        predictionText = ', '.join(prediction).replace("'", "''")
        idText = str(scholarshipWithClassStatusID).replace("'", "''")
        db.insertUpdateOrDeleteDB(
            "update dbo.ScholarshipsWithClassStatuses set OneVsRestClassifierPrediction='" + predictionText +
            "' where ScholarshipsWithClassStatusId = '" + idText + "'")
def test_runModelOnFreshman(self):
    """Run the pretrained 'Freshman' logistic-regression model on the database texts and display results.

    Both saved files are opened for reading first, so the test fails before
    classification if either one is missing. No assertions are made.
    """
    classStatus = 'Freshman'
    # Raw strings keep the backslashes literal; the plain literals relied on
    # '\C' and '\%' being unrecognised escapes, which newer Pythons warn about.
    # The resulting path text is unchanged.
    modelSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedLRModel' % classStatus
    featuresValueCountsSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedFeaturesValueCounts' % classStatus

    # test to make sure can open model and fvc save files:
    with open(modelSaveFile, 'rb'):
        pass
    with open(featuresValueCountsSaveFile, 'rb'):
        pass

    # now do test
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
    testClassify = LogisticRegressionClassifyClassStatusFromPretrainedModel(
        trainedModelInputFile=modelSaveFile,
        trainedFeaturesValueCountsIndexesFile=featuresValueCountsSaveFile,
        testingDataTextList=dataTextList,
        testingDataIdsList=idsList)
    testClassify.displayResults()
def test_RunEnsembleClassifierInsertResultsIntoDB(self):
    """Classify every record with the pretrained ensemble and store the predictions.

    Each prediction is joined with ', ' and written to the
    EnsembleClassifierPrediction column of dbo.ScholarshipsWithClassStatuses
    for the matching ScholarshipsWithClassStatusId. No assertions are made.
    """
    # get data from db
    databaseInfoExtract = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfoExtract.getConcatenatedDescriptionsEligibilities()
    idsList = databaseInfoExtract.getScholarshipsWithClassStatusIdsList()

    # run classifier, return results
    ensembleClassifyTest = LogisticRegressionMultilabelClassifyClassStatusFromPretrainedModels(dataTextList,
                                                                                               idsList)
    idsAndPredictionsList = ensembleClassifyTest.doAllClassificationsAndReturnFilteredPredictionsListsById()

    # insert results into db
    db = SUDBConnect()
    # Each entry carries the id at index 0 and its prediction labels at index 1;
    # they are read in one pass instead of building two parallel lists first.
    for idAndPrediction in idsAndPredictionsList:
        scholarshipWithClassStatusID = idAndPrediction[0]
        prediction = idAndPrediction[1]
        # The statement is assembled as text, so single quotes inside a value
        # are doubled to keep the SQL string literals well-formed.
        predictionText = ', '.join(prediction).replace("'", "''")
        idText = str(scholarshipWithClassStatusID).replace("'", "''")
        db.insertUpdateOrDeleteDB(
            "update dbo.ScholarshipsWithClassStatuses set EnsembleClassifierPrediction='" + predictionText +
            "' where ScholarshipsWithClassStatusId = '" + idText + "'")
def test_runModelOnFreshman(self):
    """Run the pretrained 'Freshman' logistic-regression model on the database texts and display results.

    Both saved files are opened for reading first, so the test fails before
    classification if either one is missing. No assertions are made.
    """
    classStatus = 'Freshman'
    # Raw strings keep the backslashes literal; the plain literals relied on
    # '\C' and '\%' being unrecognised escapes, which newer Pythons warn about.
    # The resulting path text is unchanged.
    modelSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedLRModel' % classStatus
    featuresValueCountsSaveFile = r'..\ClassifierTrainedModels\%sClassStatusTrainedFeaturesValueCounts' % classStatus

    # test to make sure can open model and fvc save files:
    with open(modelSaveFile, 'rb'):
        pass
    with open(featuresValueCountsSaveFile, 'rb'):
        pass

    # now do test
    databaseInfo = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = databaseInfo.getConcatenatedDescriptionsEligibilities()
    idsList = databaseInfo.getScholarshipsWithClassStatusIdsList()
    testClassify = LogisticRegressionClassifyClassStatusFromPretrainedModel(
        trainedModelInputFile=modelSaveFile,
        trainedFeaturesValueCountsIndexesFile=featuresValueCountsSaveFile,
        testingDataTextList=dataTextList,
        testingDataIdsList=idsList)
    testClassify.displayResults()
def __init__(self, classStatus, trainingPercentage, modelSaveFile=None, featuresValuesCountsSaveFile=None):
    """Prepare binary training/testing data and an untrained classifier for one class status.

    Args:
        classStatus: label for the positive class; records without it are labelled 'Other'.
        trainingPercentage: split value forwarded to the dataset builder.
        modelSaveFile: optional path stored on the instance for the trained model.
        featuresValuesCountsSaveFile: optional path stored on the instance for the
            features-value-counts data.

    Raises:
        ValueError: if the dataset builder returns None instead of a split.
    """
    self.classStatus = classStatus
    self.trainingPercentage = trainingPercentage
    self.badLabel = 'Other'
    self.modelSaveFile = modelSaveFile
    self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

    print('Creating datasets...')
    # Records carrying the class status, and (useNot=True) the complementary set.
    goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus)
    badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus,
                                                                    useNot=True)
    goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
    goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
    badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
    badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

    # The caller's percentage is forwarded; previously a fixed 0.8 was passed
    # here and the constructor argument had no effect on the split.
    trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=self.classStatus,
        secondLabel=self.badLabel,
        firstLabelTextList=goodClassStatusDataTextList,
        secondLabelTextList=badClassStatusDataTextList,
        firstLabelIdsList=goodClassStatusIds,
        secondLabelIdsList=badClassStatusIds,
        trainingPercentage=self.trainingPercentage)
    # The project's tests expect None from the builder for an out-of-range
    # percentage; fail with a clear message rather than a subscript TypeError.
    if trainingTestingList is None:
        raise ValueError('Could not build training/testing sets with trainingPercentage=%r'
                         % (trainingPercentage,))
    self.training = trainingTestingList[0]
    self.testing = trainingTestingList[1]

    self.dataFrame = self.makeDataFrame()
    self.trainingVectors = []
    self.testingVectors = []
    self.featuresValueCountsIndexes = []
    self.logisticRegressionClassifier = LogisticRegression()
def __init__(self, classStatus, trainingPercentage, modelSaveFile=None, featuresValuesCountsSaveFile=None):
    """Prepare binary training/testing data and an untrained classifier for one class status.

    Args:
        classStatus: label for the positive class; records without it are labelled 'Other'.
        trainingPercentage: split value forwarded to the dataset builder.
        modelSaveFile: optional path stored on the instance for the trained model.
        featuresValuesCountsSaveFile: optional path stored on the instance for the
            features-value-counts data.

    Raises:
        ValueError: if the dataset builder returns None instead of a split.
    """
    self.classStatus = classStatus
    self.trainingPercentage = trainingPercentage
    self.badLabel = 'Other'
    self.modelSaveFile = modelSaveFile
    self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

    print('Creating datasets...')
    # Records carrying the class status, and (useNot=True) the complementary set.
    goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus)
    badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus,
                                                                    useNot=True)
    goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
    goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
    badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
    badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

    # The caller's percentage is forwarded; previously a fixed 0.8 was passed
    # here and the constructor argument had no effect on the split.
    trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=self.classStatus,
        secondLabel=self.badLabel,
        firstLabelTextList=goodClassStatusDataTextList,
        secondLabelTextList=badClassStatusDataTextList,
        firstLabelIdsList=goodClassStatusIds,
        secondLabelIdsList=badClassStatusIds,
        trainingPercentage=self.trainingPercentage)
    # The project's tests expect None from the builder for an out-of-range
    # percentage; fail with a clear message rather than a subscript TypeError.
    if trainingTestingList is None:
        raise ValueError('Could not build training/testing sets with trainingPercentage=%r'
                         % (trainingPercentage,))
    self.training = trainingTestingList[0]
    self.testing = trainingTestingList[1]

    self.dataFrame = self.makeDataFrame()
    self.trainingVectors = []
    self.testingVectors = []
    self.featuresValueCountsIndexes = []
    self.logisticRegressionClassifier = LogisticRegression()
def test_makeTrainingTestingSetBadPercentage(self):
    """A training percentage of 2 must make the binary dataset builder return None."""
    targetStatus = 'Senior'

    # Records carrying the target status, and (useNot=True) the complementary set.
    matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus)
    otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus, useNot=True)

    matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
    matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
    otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
    otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

    result = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=targetStatus,
        secondLabel='Other',
        firstLabelTextList=matchingTexts,
        secondLabelTextList=otherTexts,
        firstLabelIdsList=matchingIds,
        secondLabelIdsList=otherIds,
        trainingPercentage=2)
    self.assertIsNone(result)
def test_makeTrainingTestingSets(self):
    """A 0.9 split of the 'Junior' data must size the training set correctly and lose no records."""
    targetStatus = 'Junior'
    splitFraction = 0.9

    # Records carrying the target status, and (useNot=True) the complementary set.
    matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus)
    otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=targetStatus, useNot=True)

    matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
    matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
    otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
    otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

    splitSets = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=targetStatus,
        secondLabel='Other',
        firstLabelTextList=matchingTexts,
        secondLabelTextList=otherTexts,
        firstLabelIdsList=matchingIds,
        secondLabelIdsList=otherIds,
        trainingPercentage=splitFraction)
    trainingPart = splitSets[0]
    testingPart = splitSets[1]
    combinedSize = len(trainingPart) + len(testingPart)

    # The training part is expected to hold ceil(total * fraction) items.
    self.assertEqual(math.ceil(combinedSize * splitFraction), len(trainingPart))
    # Every input text must land in exactly one of the two parts.
    self.assertEqual((len(matchingTexts) + len(otherTexts)), combinedSize)