def __init__(self, classStatus, trainingPercentage, modelSaveFile=None, featuresValuesCountsSaveFile=None):
        """Build the binary training/testing sets and an untrained classifier.

        Scholarships fetched with ``requirementNeeded=classStatus`` are labelled
        ``classStatus``; those fetched with ``useNot=True`` are labelled
        ``'Other'``.

        :param classStatus: requirement value used for the database queries and
            as the first label.
        :param trainingPercentage: split ratio forwarded to
            ``MakeDataSet.makeBinaryLabelTrainingAndTestingSet``.
        :param modelSaveFile: stored on the instance; not used in this method.
        :param featuresValuesCountsSaveFile: stored as
            ``featuresValueCountsSaveFile``; not used in this method.
        :raises ValueError: if the dataset builder returns ``None``.
        """
        self.classStatus = classStatus
        self.trainingPercentage = trainingPercentage
        self.badLabel = 'Other'
        self.modelSaveFile = modelSaveFile
        self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

        print('Creating datasets...')
        goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus)
        badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=self.classStatus, useNot=True)

        goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList()
        goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities()
        badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList()
        badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities()

        # Honour the caller-supplied percentage instead of a hard-coded 0.8.
        trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=self.classStatus, secondLabel=self.badLabel, firstLabelTextList=goodClassStatusDataTextList,
            secondLabelTextList=badClassStatusDataTextList, firstLabelIdsList=goodClassStatusIds,
            secondLabelIdsList=badClassStatusIds, trainingPercentage=self.trainingPercentage)
        # The builder can return None (presumably for an invalid percentage --
        # confirm against MakeDataSet); fail with a clear message rather than an
        # opaque "NoneType is not subscriptable".
        if trainingTestingList is None:
            raise ValueError(
                'Could not build training/testing sets with trainingPercentage={!r}'.format(
                    self.trainingPercentage))
        self.training = trainingTestingList[0]
        self.testing = trainingTestingList[1]

        self.dataFrame = self.makeDataFrame()

        self.trainingVectors = []
        self.testingVectors = []

        self.featuresValueCountsIndexes = []
        self.logisticRegressionClassifier = LogisticRegression()
Example #2
0
    def test_makeTrainingTestingSetBadPercentage(self):
        """A training percentage of 2 must make the binary split builder return None."""
        targetStatus = 'Senior'
        matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=targetStatus)
        nonMatchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=targetStatus, useNot=True)

        # Query order is kept: matching ids, matching texts, then the complement.
        matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
        matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
        nonMatchingIds = nonMatchingDB.getScholarshipsWithClassStatusIdsList()
        nonMatchingTexts = nonMatchingDB.getConcatenatedDescriptionsEligibilities()

        builderArguments = dict(
            firstLabel=targetStatus,
            secondLabel='Other',
            firstLabelTextList=matchingTexts,
            secondLabelTextList=nonMatchingTexts,
            firstLabelIdsList=matchingIds,
            secondLabelIdsList=nonMatchingIds,
            trainingPercentage=2,
        )
        splitResult = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            **builderArguments)

        self.assertIsNone(splitResult)
Example #3
0
    def test_makeTrainingTestingSets(self):
        """Check the binary split sizes for a 0.9 training percentage.

        The training set must hold ceil(total * percentage) instances, and the
        two sets together must contain every fetched text.
        """
        targetStatus = 'Junior'
        splitRatio = 0.9

        matchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=targetStatus)
        nonMatchingDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=targetStatus, useNot=True)

        matchingIds = matchingDB.getScholarshipsWithClassStatusIdsList()
        matchingTexts = matchingDB.getConcatenatedDescriptionsEligibilities()
        nonMatchingIds = nonMatchingDB.getScholarshipsWithClassStatusIdsList()
        nonMatchingTexts = nonMatchingDB.getConcatenatedDescriptionsEligibilities()

        splitResult = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=targetStatus,
            secondLabel='Other',
            firstLabelTextList=matchingTexts,
            secondLabelTextList=nonMatchingTexts,
            firstLabelIdsList=matchingIds,
            secondLabelIdsList=nonMatchingIds,
            trainingPercentage=splitRatio)
        trainingPart = splitResult[0]
        testingPart = splitResult[1]

        combinedSize = len(trainingPart) + len(testingPart)
        expectedTrainingSize = math.ceil(combinedSize * splitRatio)
        fetchedTextCount = len(matchingTexts) + len(nonMatchingTexts)

        self.assertEqual(expectedTrainingSize, len(trainingPart))
        self.assertEqual(fetchedTextCount, combinedSize)
    def test_makeMultilabelTrainingAndTestingSets(self):
        """The multilabel training set must hold ceil(total * percentage) instances."""
        splitRatio = 0.9
        # Each list comes from its own freshly constructed database helper.
        texts = GetDatabaseInfoScholarshipsWithClassStatuses().getConcatenatedDescriptionsEligibilities()
        labels = GetDatabaseInfoScholarshipsWithClassStatuses().getRequirementNeededList()
        ids = GetDatabaseInfoScholarshipsWithClassStatuses().getScholarshipsWithClassStatusIdsList()

        trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            texts, labels, ids, splitRatio)

        combinedSize = len(trainingSet) + len(testingSet)
        expectedTrainingSize = math.ceil(combinedSize * splitRatio)
        self.assertEqual(expectedTrainingSize, len(trainingSet))
    def __init__(self, dataTextList, labelsList, idsList, trainingPercentage=0.8):
        """Store the raw data, split it into training/testing sets and create the classifier.

        :param dataTextList: texts forwarded to the multilabel split builder.
        :param labelsList: labels forwarded to the multilabel split builder.
        :param idsList: ids forwarded to the multilabel split builder.
        :param trainingPercentage: split ratio forwarded to the builder (default 0.8).
        """
        self.dataTextList = dataTextList
        self.labelsList = labelsList
        self.idsList = idsList
        self.trainingPercentage = trainingPercentage

        splitSets = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            self.dataTextList,
            self.labelsList,
            self.idsList,
            self.trainingPercentage,
        )
        self.trainingSet, self.testingSet = splitSets

        # makeDataFrame() runs only after the sets exist, as in the original ordering.
        self.dataFrame = self.makeDataFrame()

        self.featuresValueCountIndexes = []

        baseEstimator = LogisticRegression()
        self.oneVsRestClassifier = OneVsRestClassifier(baseEstimator)
    def test_SeeWhatTheMultilabelTrainingSetIsActuallyPassingAsLabels(self):
        """The second training instance's 'label' must be a list whose first item is a str."""
        splitRatio = 0.9
        texts = GetDatabaseInfoScholarshipsWithClassStatuses().getConcatenatedDescriptionsEligibilities()
        labels = GetDatabaseInfoScholarshipsWithClassStatuses().getRequirementNeededList()
        ids = GetDatabaseInfoScholarshipsWithClassStatuses().getScholarshipsWithClassStatusIdsList()

        trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            texts, labels, ids, splitRatio)

        # Collect the label of every training instance, then inspect the second one.
        collectedLabels = []
        for instance in trainingSet:
            collectedLabels.append(instance['label'])

        self.assertEqual(type(collectedLabels[1]), list)
        self.assertEqual(type(collectedLabels[1][0]), str)
Example #7
0
    def test_makeMultilabelTrainingAndTestingSets(self):
        """Verify that the multilabel split puts ceil(total * 0.9) instances in training."""
        descriptionTexts = (
            GetDatabaseInfoScholarshipsWithClassStatuses()
            .getConcatenatedDescriptionsEligibilities())
        requirementLabels = (
            GetDatabaseInfoScholarshipsWithClassStatuses()
            .getRequirementNeededList())
        scholarshipIds = (
            GetDatabaseInfoScholarshipsWithClassStatuses()
            .getScholarshipsWithClassStatusIdsList())
        ratio = 0.9

        trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            descriptionTexts, requirementLabels, scholarshipIds, ratio)

        overallSize = len(trainingSet) + len(testingSet)
        wantedTrainingSize = math.ceil(overallSize * ratio)
        self.assertEqual(wantedTrainingSize, len(trainingSet))
Example #8
0
    def test_SeeWhatTheMultilabelTrainingSetIsActuallyPassingAsLabels(self):
        """Inspect the label types stored on the multilabel training instances.

        The 'label' entry of the second training instance must be exactly a
        ``list`` and its first element exactly a ``str``.
        """
        descriptionTexts = (
            GetDatabaseInfoScholarshipsWithClassStatuses()
            .getConcatenatedDescriptionsEligibilities())
        requirementLabels = (
            GetDatabaseInfoScholarshipsWithClassStatuses()
            .getRequirementNeededList())
        scholarshipIds = (
            GetDatabaseInfoScholarshipsWithClassStatuses()
            .getScholarshipsWithClassStatusIdsList())
        ratio = 0.9

        trainingSet, testingSet = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            descriptionTexts, requirementLabels, scholarshipIds, ratio)

        labelsPerInstance = []
        for instance in trainingSet:
            labelsPerInstance.append(instance['label'])

        self.assertEqual(type(labelsPerInstance[1]), list)
        self.assertEqual(type(labelsPerInstance[1][0]), str)
    def __init__(self, pretrainedModelFile, pretrainedFeatureValueCountsFile, testingDataTextList, testingDataIdsList):
        """Build the testing set and load a pickled classifier plus its feature index data.

        :param pretrainedModelFile: path of the pickle file loaded into
            ``self.oneVsRestClassifier``.
        :param pretrainedFeatureValueCountsFile: path of the pickle file loaded
            into ``self.pretrainedFeaturesValueCountIndexes``.
        :param testingDataTextList: texts passed to ``MakeDataSet.makeDataSet``.
        :param testingDataIdsList: ids passed to ``MakeDataSet.makeDataSet``.
        """
        self.pretrainedModelFile = pretrainedModelFile
        self.pretrainedFeatureValueCountsFile = pretrainedFeatureValueCountsFile
        self.testingDataTextList = testingDataTextList
        self.testingDataIdsList = testingDataIdsList

        # The testing data is built with an empty string as its labels argument.
        self.testingSet = MakeDataSet.makeDataSet(labels='', dataTextList=self.testingDataTextList,
                                                  idsList=self.testingDataIdsList)

        self.dataFrame = self.makeDataFrame()

        # NOTE(review): pickle.load can execute arbitrary code; only pass files
        # from a trusted source.
        # 'with' closes each file even when unpickling raises.
        with open(self.pretrainedModelFile, 'rb') as pretrainedModelInput:
            self.oneVsRestClassifier = pickle.load(pretrainedModelInput)

        with open(self.pretrainedFeatureValueCountsFile, 'rb') as pretrainedFeaturesValueCountsInput:
            self.pretrainedFeaturesValueCountIndexes = pickle.load(pretrainedFeaturesValueCountsInput)
    def test_makeTrainingTestingSetBadPercentage(self):
        """The binary split builder must return None when given a training percentage of 2."""
        seniorStatus = 'Senior'
        invalidPercentage = 2

        seniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=seniorStatus)
        notSeniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=seniorStatus,
                                                                   useNot=True)

        seniorIds = seniorDB.getScholarshipsWithClassStatusIdsList()
        seniorTexts = seniorDB.getConcatenatedDescriptionsEligibilities()
        notSeniorIds = notSeniorDB.getScholarshipsWithClassStatusIdsList()
        notSeniorTexts = notSeniorDB.getConcatenatedDescriptionsEligibilities()

        outcome = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=seniorStatus,
            secondLabel='Other',
            firstLabelTextList=seniorTexts,
            secondLabelTextList=notSeniorTexts,
            firstLabelIdsList=seniorIds,
            secondLabelIdsList=notSeniorIds,
            trainingPercentage=invalidPercentage)

        self.assertIsNone(outcome)
Example #11
0
    def __init__(self,
                 classStatus,
                 trainingPercentage,
                 modelSaveFile=None,
                 featuresValuesCountsSaveFile=None):
        """Build the binary training/testing sets and an untrained classifier.

        Scholarships fetched with ``requirementNeeded=classStatus`` are labelled
        ``classStatus``; those fetched with ``useNot=True`` are labelled
        ``'Other'``.

        :param classStatus: requirement value used for the database queries and
            as the first label.
        :param trainingPercentage: split ratio forwarded to
            ``MakeDataSet.makeBinaryLabelTrainingAndTestingSet``.
        :param modelSaveFile: stored on the instance; not used in this method.
        :param featuresValuesCountsSaveFile: stored as
            ``featuresValueCountsSaveFile``; not used in this method.
        :raises ValueError: if the dataset builder returns ``None``.
        """
        self.classStatus = classStatus
        self.trainingPercentage = trainingPercentage
        self.badLabel = 'Other'
        self.modelSaveFile = modelSaveFile
        self.featuresValueCountsSaveFile = featuresValuesCountsSaveFile

        print('Creating datasets...')
        goodClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=self.classStatus)
        badClassStatusDB = GetDatabaseInfoScholarshipsWithClassStatuses(
            requirementNeeded=self.classStatus, useNot=True)

        goodClassStatusIds = goodClassStatusDB.getScholarshipsWithClassStatusIdsList(
        )
        goodClassStatusDataTextList = goodClassStatusDB.getConcatenatedDescriptionsEligibilities(
        )
        badClassStatusIds = badClassStatusDB.getScholarshipsWithClassStatusIdsList(
        )
        badClassStatusDataTextList = badClassStatusDB.getConcatenatedDescriptionsEligibilities(
        )

        # Honour the caller-supplied percentage instead of a hard-coded 0.8.
        trainingTestingList = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(
            firstLabel=self.classStatus,
            secondLabel=self.badLabel,
            firstLabelTextList=goodClassStatusDataTextList,
            secondLabelTextList=badClassStatusDataTextList,
            firstLabelIdsList=goodClassStatusIds,
            secondLabelIdsList=badClassStatusIds,
            trainingPercentage=self.trainingPercentage)
        # The builder can return None (presumably for an invalid percentage --
        # confirm against MakeDataSet); fail with a clear message rather than an
        # opaque "NoneType is not subscriptable".
        if trainingTestingList is None:
            raise ValueError(
                'Could not build training/testing sets with '
                'trainingPercentage={!r}'.format(self.trainingPercentage))
        self.training = trainingTestingList[0]
        self.testing = trainingTestingList[1]

        self.dataFrame = self.makeDataFrame()

        self.trainingVectors = []
        self.testingVectors = []

        self.featuresValueCountsIndexes = []
        self.logisticRegressionClassifier = LogisticRegression()
    def __init__(self, trainedModelInputFile, trainedFeaturesValueCountsIndexesFile, testingDataTextList,
                 testingDataIdsList):
        """Build the testing set and load a pickled classifier plus its feature index data.

        :param trainedModelInputFile: path of the pickle file loaded into
            ``self.logisticRegressionClassifier``.
        :param trainedFeaturesValueCountsIndexesFile: path of the pickle file
            loaded into ``self.featuresValueCountsIndexes``.
        :param testingDataTextList: texts passed to ``MakeDataSet.makeDataSet``.
        :param testingDataIdsList: ids passed to ``MakeDataSet.makeDataSet``.
        """
        self.trainedFeaturesValueCountsIndexesFile = trainedFeaturesValueCountsIndexesFile
        self.trainedModelInputFile = trainedModelInputFile
        self.testingDataTextList = testingDataTextList
        self.testingDataIdsList = testingDataIdsList

        # The testing data is built with an empty string as its labels argument.
        self.testing = MakeDataSet.makeDataSet(labels='', dataTextList=self.testingDataTextList,
                                               idsList=self.testingDataIdsList)

        self.dataFrame = self.makeDataFrame()

        # NOTE(review): pickle.load can execute arbitrary code; only pass files
        # from a trusted source.
        # 'with' closes each file even when unpickling raises.
        with open(self.trainedModelInputFile, 'rb') as modelInput:
            self.logisticRegressionClassifier = pickle.load(modelInput)

        with open(self.trainedFeaturesValueCountsIndexesFile, 'rb') as featuresValueCountsInput:
            self.featuresValueCountsIndexes = pickle.load(featuresValueCountsInput)
    def __init__(self,
                 dataTextList,
                 labelsList,
                 idsList,
                 trainingPercentage=0.8):
        """Keep the raw inputs, derive the training/testing split and set up the classifier.

        :param dataTextList: texts handed to the multilabel split builder.
        :param labelsList: labels handed to the multilabel split builder.
        :param idsList: ids handed to the multilabel split builder.
        :param trainingPercentage: split ratio handed to the builder (default 0.8).
        """
        self.dataTextList = dataTextList
        self.labelsList = labelsList
        self.idsList = idsList
        self.trainingPercentage = trainingPercentage

        trainingAndTesting = MakeDataSet.makeMultilabelTrainingAndTestingSet(
            self.dataTextList,
            self.labelsList,
            self.idsList,
            self.trainingPercentage,
        )
        self.trainingSet, self.testingSet = trainingAndTesting

        # The data frame is built after the split is stored, as in the original.
        self.dataFrame = self.makeDataFrame()

        self.featuresValueCountIndexes = []

        wrappedEstimator = LogisticRegression()
        self.oneVsRestClassifier = OneVsRestClassifier(wrappedEstimator)
    def test_makeTrainingTestingSets(self):
        """Validate the sizes of the binary training/testing split at a 0.9 ratio.

        The training set must contain ceil(total * ratio) items, and the total
        must equal the number of texts fetched for both labels.
        """
        juniorStatus = 'Junior'
        ratio = 0.9

        juniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=juniorStatus)
        notJuniorDB = GetDatabaseInfoScholarshipsWithClassStatuses(requirementNeeded=juniorStatus,
                                                                   useNot=True)

        juniorIds = juniorDB.getScholarshipsWithClassStatusIdsList()
        juniorTexts = juniorDB.getConcatenatedDescriptionsEligibilities()
        notJuniorIds = notJuniorDB.getScholarshipsWithClassStatusIdsList()
        notJuniorTexts = notJuniorDB.getConcatenatedDescriptionsEligibilities()

        builderArguments = dict(
            firstLabel=juniorStatus,
            secondLabel='Other',
            firstLabelTextList=juniorTexts,
            secondLabelTextList=notJuniorTexts,
            firstLabelIdsList=juniorIds,
            secondLabelIdsList=notJuniorIds,
            trainingPercentage=ratio,
        )
        splitOutcome = MakeDataSet.makeBinaryLabelTrainingAndTestingSet(**builderArguments)
        trainingItems = splitOutcome[0]
        testingItems = splitOutcome[1]

        overallSize = len(trainingItems) + len(testingItems)
        self.assertEqual(math.ceil(overallSize * ratio), len(trainingItems))

        fetchedCount = len(juniorTexts) + len(notJuniorTexts)
        self.assertEqual(fetchedCount, overallSize)
    def __init__(self, pretrainedModelFile, pretrainedFeatureValueCountsFile,
                 testingDataTextList, testingDataIdsList):
        """Build the testing set and load a pickled classifier plus its feature index data.

        :param pretrainedModelFile: path of the pickle file loaded into
            ``self.oneVsRestClassifier``.
        :param pretrainedFeatureValueCountsFile: path of the pickle file loaded
            into ``self.pretrainedFeaturesValueCountIndexes``.
        :param testingDataTextList: texts passed to ``MakeDataSet.makeDataSet``.
        :param testingDataIdsList: ids passed to ``MakeDataSet.makeDataSet``.
        """
        self.pretrainedModelFile = pretrainedModelFile
        self.pretrainedFeatureValueCountsFile = pretrainedFeatureValueCountsFile
        self.testingDataTextList = testingDataTextList
        self.testingDataIdsList = testingDataIdsList

        # The testing data is built with an empty string as its labels argument.
        self.testingSet = MakeDataSet.makeDataSet(
            labels='',
            dataTextList=self.testingDataTextList,
            idsList=self.testingDataIdsList)

        self.dataFrame = self.makeDataFrame()

        # NOTE(review): pickle.load can execute arbitrary code; only pass files
        # from a trusted source.
        # 'with' closes each file even when unpickling raises.
        with open(self.pretrainedModelFile, 'rb') as pretrainedModelInput:
            self.oneVsRestClassifier = pickle.load(pretrainedModelInput)

        with open(self.pretrainedFeatureValueCountsFile,
                  'rb') as pretrainedFeaturesValueCountsInput:
            self.pretrainedFeaturesValueCountIndexes = pickle.load(
                pretrainedFeaturesValueCountsInput)
Example #16
0
    def __init__(self, trainedModelInputFile,
                 trainedFeaturesValueCountsIndexesFile, testingDataTextList,
                 testingDataIdsList):
        """Build the testing set and load a pickled classifier plus its feature index data.

        :param trainedModelInputFile: path of the pickle file loaded into
            ``self.logisticRegressionClassifier``.
        :param trainedFeaturesValueCountsIndexesFile: path of the pickle file
            loaded into ``self.featuresValueCountsIndexes``.
        :param testingDataTextList: texts passed to ``MakeDataSet.makeDataSet``.
        :param testingDataIdsList: ids passed to ``MakeDataSet.makeDataSet``.
        """
        self.trainedFeaturesValueCountsIndexesFile = trainedFeaturesValueCountsIndexesFile
        self.trainedModelInputFile = trainedModelInputFile
        self.testingDataTextList = testingDataTextList
        self.testingDataIdsList = testingDataIdsList

        # The testing data is built with an empty string as its labels argument.
        self.testing = MakeDataSet.makeDataSet(
            labels='',
            dataTextList=self.testingDataTextList,
            idsList=self.testingDataIdsList)

        self.dataFrame = self.makeDataFrame()

        # NOTE(review): pickle.load can execute arbitrary code; only pass files
        # from a trusted source.
        # 'with' closes each file even when unpickling raises.
        with open(self.trainedModelInputFile, 'rb') as modelInput:
            self.logisticRegressionClassifier = pickle.load(modelInput)

        with open(self.trainedFeaturesValueCountsIndexesFile,
                  'rb') as featuresValueCountsInput:
            self.featuresValueCountsIndexes = pickle.load(
                featuresValueCountsInput)