def __init__(self, classStatus, trainingPercentage, modelSaveFile=None,
             featuresValuesCountsSaveFile=None):
    """Collect the labelled scholarship texts and split them for training.

    Args:
        classStatus: Used as the first label of the binary data set and
            passed to the database helper as ``requirementNeeded``.
        trainingPercentage: Share of the data to train on; forwarded to
            ``MakeDataSet.makeBinaryLabelTrainingAndTestingSet``.
        modelSaveFile: Stored on the instance; not used by the constructor.
        featuresValuesCountsSaveFile: Stored as
            ``self.featuresValueCountsSaveFile``; not used by the constructor.

    Raises:
        ValueError: If the data set helper returns ``None`` instead of a
            training/testing pair.
    """
    self.classStatus = classStatus
    self.trainingPercentage = trainingPercentage
    self.badLabel = 'Other'
    self.modelSaveFile = modelSaveFile
    self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

    print('Creating datasets...')
    # NOTE(review): useNot=True presumably selects the scholarships that do
    # NOT require this class status -- confirm against the database helper.
    goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=self.classStatus)
    badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=self.classStatus, useNot=True)

    goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
    goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
    badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
    badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

    # Honour the caller's split instead of a hard-coded 0.8, which silently
    # ignored the trainingPercentage argument.
    trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=self.classStatus,
        secondLabel=self.badLabel,
        firstLabelTextList=goodClassStatusDataTextList,
        secondLabelTextList=badClassStatusDataTextList,
        firstLabelIdsList=goodClassStatusIds,
        secondLabelIdsList=badClassStatusIds,
        trainingPercentage=self.trainingPercentage)

    # The helper can hand back None (the unit tests expect that for an
    # out-of-range percentage); fail loudly rather than with an opaque
    # TypeError when indexing below.
    if trainingTestingList is None:
        raise ValueError(
            'Could not build training/testing sets with trainingPercentage={!r}'.format(
                self.trainingPercentage))

    self.training = trainingTestingList[0]
    self.testing = trainingTestingList[1]
    self.dataFrame = self.makeDataFrame()

    self.trainingVectors = []
    self.testingVectors = []
    self.featuresValueCountsIndexes = []
    self.logisticRegressionClassifier = LogisticRegression()
def test_makeTrainingTestingSetBadPercentage(self):
    """A training percentage of 2 must make the binary split helper return None."""
    classStatus = 'Senior'
    matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=classStatus)
    otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=classStatus, useNot=True)

    # Ids and texts are fetched in the same order as before: matching first,
    # then the "not" query.
    matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
    matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
    otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
    otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

    invalidPercentage = 2
    splitResult = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=classStatus,
        secondLabel='Other',
        firstLabelTextList=matchingTexts,
        secondLabelTextList=otherTexts,
        firstLabelIdsList=matchingIds,
        secondLabelIdsList=otherIds,
        trainingPercentage=invalidPercentage)

    self.assertIsNone(splitResult)
def test_makeTrainingTestingSets(self):
    """A 0.9 split keeps every instance and puts ceil(90%) of them in training."""
    classStatus = 'Junior'
    matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=classStatus)
    otherDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=classStatus, useNot=True)

    matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
    matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
    otherIds = otherDB.getScholarshipsWithClassStatusIdsList()
    otherTexts = otherDB.getConcatenatedDescriptionsEligibilities()

    trainingPercentage = 0.9
    splitResult = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=classStatus,
        secondLabel='Other',
        firstLabelTextList=matchingTexts,
        secondLabelTextList=otherTexts,
        firstLabelIdsList=matchingIds,
        secondLabelIdsList=otherIds,
        trainingPercentage=trainingPercentage)

    trainingPart = splitResult[0]
    testingPart = splitResult[1]
    totalSetSize = len(trainingPart) + len(testingPart)

    # The training part is expected to be rounded up.
    expectedTrainingSize = math.ceil(totalSetSize * trainingPercentage)
    self.assertEqual(expectedTrainingSize, len(trainingPart))

    # No instance may be lost or duplicated by the split.
    inputSize = len(matchingTexts) + len(otherTexts)
    self.assertEqual(inputSize, totalSetSize)
def test_makeMultilabelTrainingAndTestingSets(self):
    """The multilabel split puts ceil(total * 0.9) instances in the training set."""
    # Each list comes from its own freshly constructed database helper,
    # exactly as before.
    textsDB = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = textsDB.getConcatenatedDescriptionsEligibilities()
    labelsDB = GetDatabaseInfoScholarshipsWithClassStatuses()
    labelsList = labelsDB.getRequirementNeededList()
    idsDB = GetDatabaseInfoScholarshipsWithClassStatuses()
    idsList = idsDB.getScholarshipsWithClassStatusIdsList()

    trainingPercentage = 0.9
    trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        dataTextList, labelsList, idsList, trainingPercentage)

    totalSetSize = len(trainingSet) + len(testingSet)
    expectedTrainingSize = math.ceil(totalSetSize * trainingPercentage)
    self.assertEqual(expectedTrainingSize, len(trainingSet))
def __init__(self, dataTextList, labelsList, idsList, trainingPercentage=0.8):
    """Store the raw inputs, split them, and prepare a one-vs-rest classifier.

    Args:
        dataTextList: Texts handed to the multilabel split helper.
        labelsList: Labels handed to the multilabel split helper.
        idsList: Ids handed to the multilabel split helper.
        trainingPercentage: Split ratio handed to the helper (default 0.8).
    """
    self.dataTextList = dataTextList
    self.labelsList = labelsList
    self.idsList = idsList
    self.trainingPercentage = trainingPercentage

    splitSets = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        self.dataTextList,
        self.labelsList,
        self.idsList,
        self.trainingPercentage)
    self.trainingSet, self.testingSet = splitSets

    # The data frame is built only after the training/testing sets are in
    # place, matching the original statement order.
    self.dataFrame = self.makeDataFrame()
    self.featuresValueCountIndexes = []

    baseEstimator = LogisticRegression()
    self.oneVsRestClassifier = OneVsRestClassifier(baseEstimator)
def test_SeeWhatTheMultilabelTrainingSetIsActuallyPassingAsLabels(self):
    """Each training instance's 'label' entry should be a list of strings.

    Only the second training instance (index 1) and its first label are
    inspected.
    """
    textsDB = GetDatabaseInfoScholarshipsWithClassStatuses()
    dataTextList = textsDB.getConcatenatedDescriptionsEligibilities()
    labelsDB = GetDatabaseInfoScholarshipsWithClassStatuses()
    labelsList = labelsDB.getRequirementNeededList()
    idsDB = GetDatabaseInfoScholarshipsWithClassStatuses()
    idsList = idsDB.getScholarshipsWithClassStatusIdsList()

    trainingPercentage = 0.9
    trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        dataTextList, labelsList, idsList, trainingPercentage)

    trainingLabels = []
    for trainingInstance in trainingSet:
        trainingLabels.append(trainingInstance['label'])

    sampleLabels = trainingLabels[1]
    self.assertEqual(type(sampleLabels), list)
    self.assertEqual(type(sampleLabels[0]), str)
def test_makeMultilabelTrainingAndTestingSets(self):
    """Check that a 0.9 multilabel split sizes the training set as ceil(0.9 * total)."""
    trainingPercentage = 0.9

    # Three separate helper instances are created, one per query, in the
    # original order: texts, labels, ids.
    descriptions = GetDatabaseInfoScholarshipsWithClassStatuses(
    ).getConcatenatedDescriptionsEligibilities()
    requirements = GetDatabaseInfoScholarshipsWithClassStatuses(
    ).getRequirementNeededList()
    scholarshipIds = GetDatabaseInfoScholarshipsWithClassStatuses(
    ).getScholarshipsWithClassStatusIdsList()

    splitSets = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        descriptions, requirements, scholarshipIds, trainingPercentage)
    trainingSet, testingSet = splitSets

    trainingSize = len(trainingSet)
    totalSetSize = trainingSize + len(testingSet)
    self.assertEqual(math.ceil(totalSetSize * trainingPercentage), trainingSize)
def test_SeeWhatTheMultilabelTrainingSetIsActuallyPassingAsLabels(self):
    """Verify the label payload of the multilabel training set.

    The 'label' value of the training instance at index 1 must be a list
    whose first element is a str.
    """
    trainingPercentage = 0.9

    descriptions = GetDatabaseInfoScholarshipsWithClassStatuses(
    ).getConcatenatedDescriptionsEligibilities()
    requirements = GetDatabaseInfoScholarshipsWithClassStatuses(
    ).getRequirementNeededList()
    scholarshipIds = GetDatabaseInfoScholarshipsWithClassStatuses(
    ).getScholarshipsWithClassStatusIdsList()

    splitSets = MakeDataSet.makeMultilabelTrainingAndTestingSet(
        descriptions, requirements, scholarshipIds, trainingPercentage)
    trainingSet, testingSet = splitSets

    labelsPerInstance = list(
        map(lambda trainingInstance: trainingInstance['label'], trainingSet))

    self.assertEqual(type(labelsPerInstance[1]), list)
    self.assertEqual(type(labelsPerInstance[1][0]), str)
def __init__(self, pretrainedModelFile, pretrainedFeatureValueCountsFile,
             testingDataTextList, testingDataIdsList):
    """Build the unlabeled testing set and load the pickled model artifacts.

    Args:
        pretrainedModelFile: Path of the pickle file unpickled into
            ``self.oneVsRestClassifier``.
        pretrainedFeatureValueCountsFile: Path of the pickle file unpickled
            into ``self.pretrainedFeaturesValueCountIndexes``.
        testingDataTextList: Texts passed to ``MakeDataSet.makeDataSet``.
        testingDataIdsList: Ids passed to ``MakeDataSet.makeDataSet``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    self.pretrainedModelFile = pretrainedModelFile
    self.pretrainedFeatureValueCountsFile = pretrainedFeatureValueCountsFile
    self.testingDataTextList = testingDataTextList
    self.testingDataIdsList = testingDataIdsList

    # The data set is built with an empty-string label argument.
    self.testingSet = MakeDataSet.makeDataSet(
        labels='',
        dataTextList=self.testingDataTextList,
        idsList=self.testingDataIdsList)
    self.dataFrame = self.makeDataFrame()

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point these paths at artifacts produced by a trusted training run.
    # The context managers close the files even if unpickling raises.
    with open(self.pretrainedModelFile, 'rb') as pretrainedModelInput:
        self.oneVsRestClassifier = pickle.load(pretrainedModelInput)

    with open(self.pretrainedFeatureValueCountsFile, 'rb') as pretrainedFeaturesValueCountsInput:
        self.pretrainedFeaturesValueCountIndexes = pickle.load(
            pretrainedFeaturesValueCountsInput)
def test_makeTrainingTestingSetBadPercentage(self):
    """Passing trainingPercentage=2 to the binary split helper should give None."""
    targetStatus = 'Senior'
    fallbackLabel = 'Other'
    badPercentage = 2

    withStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=targetStatus)
    withoutStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=targetStatus, useNot=True)

    withStatusIds = withStatusDB.getScholarshipsWithClassStatusIdsList()
    withStatusTexts = withStatusDB.getConcatenatedDescriptionsEligibilities()
    withoutStatusIds = withoutStatusDB.getScholarshipsWithClassStatusIdsList()
    withoutStatusTexts = withoutStatusDB.getConcatenatedDescriptionsEligibilities()

    helperArguments = dict(
        firstLabel=targetStatus,
        secondLabel=fallbackLabel,
        firstLabelTextList=withStatusTexts,
        secondLabelTextList=withoutStatusTexts,
        firstLabelIdsList=withStatusIds,
        secondLabelIdsList=withoutStatusIds,
        trainingPercentage=badPercentage)
    outcome = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(**helperArguments)

    self.assertIsNone(outcome)
def __init__(self, classStatus, trainingPercentage, modelSaveFile=None,
             featuresValuesCountsSaveFile=None):
    """Prepare the binary training/testing data and an untrained classifier.

    Args:
        classStatus: First label of the binary data set; also the
            ``requirementNeeded`` filter given to the database helper.
        trainingPercentage: Split ratio forwarded to
            ``MakeDataSet.makeBinaryLabelTrainingAndTestingSet``.
        modelSaveFile: Kept on the instance; unused in the constructor.
        featuresValuesCountsSaveFile: Kept as
            ``self.featuresValueCountsSaveFile``; unused in the constructor.

    Raises:
        ValueError: If the split helper returns ``None`` rather than a
            training/testing pair.
    """
    self.classStatus = classStatus
    self.trainingPercentage = trainingPercentage
    self.badLabel = 'Other'
    self.modelSaveFile = modelSaveFile
    self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

    print('Creating datasets...')
    # NOTE(review): the useNot=True query presumably returns the complement
    # (scholarships without this class status) -- verify in the helper.
    goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=self.classStatus)
    badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=self.classStatus, useNot=True)

    goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
    goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
    badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
    badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

    # Previously the split was pinned to 0.8 and the constructor argument
    # had no effect; use the value the caller asked for.
    trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
        firstLabel=self.classStatus,
        secondLabel=self.badLabel,
        firstLabelTextList=goodClassStatusDataTextList,
        secondLabelTextList=badClassStatusDataTextList,
        firstLabelIdsList=goodClassStatusIds,
        secondLabelIdsList=badClassStatusIds,
        trainingPercentage=self.trainingPercentage)

    # The unit tests expect None from the helper for an out-of-range
    # percentage; report that clearly instead of crashing on None[0].
    if trainingTestingList is None:
        raise ValueError(
            'Could not build training/testing sets with trainingPercentage={!r}'.format(
                self.trainingPercentage))

    self.training = trainingTestingList[0]
    self.testing = trainingTestingList[1]
    self.dataFrame = self.makeDataFrame()

    self.trainingVectors = []
    self.testingVectors = []
    self.featuresValueCountsIndexes = []
    self.logisticRegressionClassifier = LogisticRegression()
def __init__(self, trainedModelInputFile, trainedFeaturesValueCountsIndexesFile,
             testingDataTextList, testingDataIdsList):
    """Build the unlabeled testing set and load the trained model artifacts.

    Args:
        trainedModelInputFile: Path of the pickle file unpickled into
            ``self.logisticRegressionClassifier``.
        trainedFeaturesValueCountsIndexesFile: Path of the pickle file
            unpickled into ``self.featuresValueCountsIndexes``.
        testingDataTextList: Texts passed to ``MakeDataSet.makeDataSet``.
        testingDataIdsList: Ids passed to ``MakeDataSet.makeDataSet``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    self.trainedFeaturesValueCountsIndexesFile = trainedFeaturesValueCountsIndexesFile
    self.trainedModelInputFile = trainedModelInputFile
    self.testingDataTextList = testingDataTextList
    self.testingDataIdsList = testingDataIdsList

    # The data set is built with an empty-string label argument.
    self.testing = MakeDataSet.makeDataSet(
        labels='',
        dataTextList=self.testingDataTextList,
        idsList=self.testingDataIdsList)
    self.dataFrame = self.makeDataFrame()

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load artifacts written by a trusted training run.
    # Context managers guarantee the handles are closed even when
    # unpickling fails.
    with open(self.trainedModelInputFile, 'rb') as modelInput:
        self.logisticRegressionClassifier = pickle.load(modelInput)

    with open(self.trainedFeaturesValueCountsIndexesFile, 'rb') as featuresValueCountsInput:
        self.featuresValueCountsIndexes = pickle.load(featuresValueCountsInput)
def test_makeTrainingTestingSets(self):
    """Check the sizes produced by a 0.9 binary split for the 'Junior' status.

    The training set must hold ceil(total * 0.9) instances, and the two sets
    together must contain as many instances as the two input text lists.
    """
    targetStatus = 'Junior'
    trainingPercentage = 0.9

    withStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=targetStatus)
    withoutStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
        requirementNeeded=targetStatus, useNot=True)

    withStatusIds = withStatusDB.getScholarshipsWithClassStatusIdsList()
    withStatusTexts = withStatusDB.getConcatenatedDescriptionsEligibilities()
    withoutStatusIds = withoutStatusDB.getScholarshipsWithClassStatusIdsList()
    withoutStatusTexts = withoutStatusDB.getConcatenatedDescriptionsEligibilities()

    helperArguments = dict(
        firstLabel=targetStatus,
        secondLabel='Other',
        firstLabelTextList=withStatusTexts,
        secondLabelTextList=withoutStatusTexts,
        firstLabelIdsList=withStatusIds,
        secondLabelIdsList=withoutStatusIds,
        trainingPercentage=trainingPercentage)
    bothSets = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(**helperArguments)

    trainingCount = len(bothSets[0])
    testingCount = len(bothSets[1])
    totalSetSize = trainingCount + testingCount

    self.assertEqual(math.ceil(totalSetSize * trainingPercentage), trainingCount)
    self.assertEqual((len(withStatusTexts) + len(withoutStatusTexts)), totalSetSize)
def __init__(self, pretrainedModelFile, pretrainedFeatureValueCountsFile,
             testingDataTextList, testingDataIdsList):
    """Create the testing data set and restore the pickled classifier state.

    Args:
        pretrainedModelFile: Pickle file whose content becomes
            ``self.oneVsRestClassifier``.
        pretrainedFeatureValueCountsFile: Pickle file whose content becomes
            ``self.pretrainedFeaturesValueCountIndexes``.
        testingDataTextList: Texts given to ``MakeDataSet.makeDataSet``.
        testingDataIdsList: Ids given to ``MakeDataSet.makeDataSet``.

    Raises:
        OSError: If one of the pickle files cannot be opened.
    """
    self.pretrainedModelFile = pretrainedModelFile
    self.pretrainedFeatureValueCountsFile = pretrainedFeatureValueCountsFile
    self.testingDataTextList = testingDataTextList
    self.testingDataIdsList = testingDataIdsList

    # No labels are supplied for the data to classify (labels='').
    self.testingSet = MakeDataSet.makeDataSet(
        labels='',
        dataTextList=self.testingDataTextList,
        idsList=self.testingDataIdsList)
    self.dataFrame = self.makeDataFrame()

    # SECURITY: unpickling runs code stored in the file, so both paths must
    # reference trusted artifacts.
    # `with` replaces the manual open()/close() pairs so a failing
    # pickle.load no longer leaks the file handle.
    with open(self.pretrainedModelFile, 'rb') as pretrainedModelInput:
        self.oneVsRestClassifier = pickle.load(pretrainedModelInput)

    with open(self.pretrainedFeatureValueCountsFile, 'rb') as pretrainedFeaturesValueCountsInput:
        self.pretrainedFeaturesValueCountIndexes = pickle.load(
            pretrainedFeaturesValueCountsInput)
def __init__(self, trainedModelInputFile, trainedFeaturesValueCountsIndexesFile,
             testingDataTextList, testingDataIdsList):
    """Create the testing data set and restore the pickled classifier state.

    Args:
        trainedModelInputFile: Pickle file whose content becomes
            ``self.logisticRegressionClassifier``.
        trainedFeaturesValueCountsIndexesFile: Pickle file whose content
            becomes ``self.featuresValueCountsIndexes``.
        testingDataTextList: Texts given to ``MakeDataSet.makeDataSet``.
        testingDataIdsList: Ids given to ``MakeDataSet.makeDataSet``.

    Raises:
        OSError: If one of the pickle files cannot be opened.
    """
    self.trainedFeaturesValueCountsIndexesFile = trainedFeaturesValueCountsIndexesFile
    self.trainedModelInputFile = trainedModelInputFile
    self.testingDataTextList = testingDataTextList
    self.testingDataIdsList = testingDataIdsList

    # No labels are supplied for the data to classify (labels='').
    self.testing = MakeDataSet.makeDataSet(
        labels='',
        dataTextList=self.testingDataTextList,
        idsList=self.testingDataIdsList)
    self.dataFrame = self.makeDataFrame()

    # SECURITY: unpickling runs code stored in the file, so both paths must
    # reference trusted artifacts.
    # `with` replaces the manual open()/close() pairs so a failing
    # pickle.load no longer leaks the file handle.
    with open(self.trainedModelInputFile, 'rb') as modelInput:
        self.logisticRegressionClassifier = pickle.load(modelInput)

    with open(self.trainedFeaturesValueCountsIndexesFile, 'rb') as featuresValueCountsInput:
        self.featuresValueCountsIndexes = pickle.load(featuresValueCountsInput)