def setUp(self):
        """Build the fixtures every test in this case relies on."""

        # The reader wraps the test csv resource
        reader = InputReader(testFile)
        self.mInputReader = reader

        # Unit under test: a feature extractor fed by that reader and
        # configured with the test filter file
        extractor = LendingClubFeatureExtractor(reader, filterTestFile)
        self.mFeatureExtractor = extractor
Esempio n. 2
0
    def setUp( self ):
        '''Build the collaborators and the learning agent used by each test'''

        # Dependencies that get injected into the learning agent
        reader = InputReader( testFile )
        self.mInputReader = reader
        extractor = LendingClubFeatureExtractor( reader, filterFile )
        self.mFeatureExtractor = extractor

        # Replace whatever the extractor holds with our local test data
        extractor.setTrainingData( g_testArray )

        # Class under test, constructed on top of the prepared extractor
        self.mLearningAgent = DummyLearningAgentImpl( extractor )
Esempio n. 3
0
class LearningAgentTest( unittest.TestCase ):
    '''
    Unit tests for the LearningAgent preprocessing steps (slicing,
    standardization, shuffling), exercised through DummyLearningAgentImpl
    and the module-level g_testArray fixture.
    '''

    # Absolute tolerance used when comparing floating point results
    TOLERANCE = 0.001

    def setUp( self ):
        '''Set up the dependencies for the test execution'''

        # Construct an InputReader and FeatureExtractor for dependency injection
        self.mInputReader = InputReader( testFile )
        self.mFeatureExtractor = LendingClubFeatureExtractor( self.mInputReader,
                                                              filterFile )

        # Push our local test data into the FeatureExtractor
        self.mFeatureExtractor.setTrainingData( g_testArray )

        # Now, construct the class under test with the FeatureExtractor
        self.mLearningAgent = DummyLearningAgentImpl( self.mFeatureExtractor )


    def test_getTrainingData( self ):
        '''Test getTrainingData() function returns correct data'''
        np.testing.assert_array_equal( self.mLearningAgent.getTrainingData(),
                                       g_testArray )


    def test_sampleSlice( self ):
        '''
        Test sampleSlice() function correctly splits test and train data

        The expected split is checked through checksums: the first half of
        g_testArray (rounded up) is expected in the train set, the remainder
        in the test set, with the 'loan_status' column split off as y.
        '''

        # Configure the data to be split evenly for the test
        self.mLearningAgent.sampleSlice( 0.5 )

        # Get the target feature index
        m_yIdx = self.mFeatureExtractor.listIdx( 'loan_status' )

        # Slice boundary; forced to int so it is always a valid slice index
        mBnd = int( ceil( len( g_testArray ) / 2.0 ) )

        mTrainRows = g_testArray[:mBnd]
        mTestRows = g_testArray[mBnd:]

        # Target column checksums
        m_y_train_sum = np.sum( mTrainRows[:, m_yIdx] )
        m_y_test_sum = np.sum( mTestRows[:, m_yIdx] )

        # Feature checksums: everything except the target column
        m_X_train_sum = np.sum( mTrainRows ) - m_y_train_sum
        m_X_test_sum = np.sum( mTestRows ) - m_y_test_sum

        # Verify sums match w/in some small tolerance.  The absolute
        # difference is compared (delta=...), so a deviation in either
        # direction fails the test.
        self.assertAlmostEqual( m_X_train_sum,
                                np.sum( self.mLearningAgent.X_train ),
                                delta=self.TOLERANCE )
        self.assertAlmostEqual( m_X_test_sum,
                                np.sum( self.mLearningAgent.X_test ),
                                delta=self.TOLERANCE )
        self.assertAlmostEqual( m_y_train_sum,
                                np.sum( self.mLearningAgent.y_train ),
                                delta=self.TOLERANCE )
        self.assertAlmostEqual( m_y_test_sum,
                                np.sum( self.mLearningAgent.y_test ),
                                delta=self.TOLERANCE )


    def test_standardizeSamples( self ):
        '''
        Test standardizeSamples() function calculates mean and deviation and
        applies it properly to all samples of a given feature
        Note: Just testing the first feature here
        '''

        # Configure the data to be split evenly for the test
        self.mLearningAgent.sampleSlice( 0.5 )

        # Calculate average and (population) standard deviation locally.
        # The column is copied so the expectation cannot be affected if the
        # agent standardizes X_train in place.
        mColumn = np.array( self.mLearningAgent.X_train[:, 0], dtype=float )
        mAvg = np.mean( mColumn )
        mStdDev = np.std( mColumn )

        # Apply calculated average and standard deviation to samples
        mNorm = ( mColumn - mAvg ) / mStdDev

        # Execute LearningAgent implementation
        self.mLearningAgent.standardizeSamples()

        # Assert local calculation matches with LearningAgent implementation.
        # The comparison is element-wise: summing the differences would
        # always give ~0 for two zero-mean vectors, whatever their scale.
        np.testing.assert_allclose( self.mLearningAgent.X_train[:, 0], mNorm,
                                    rtol=0, atol=self.TOLERANCE )


    def test_shuffleSamples( self ):
        '''Test shuffleSamples() function shuffles samples correctly'''

        # Initialize random number generator w/ known seed
        np.random.seed( 1 )

        # Generate index list
        m_indices = np.random.permutation( len( g_testArray ) )

        # Make LearningAgent call w/ the same seed
        self.mLearningAgent.shuffleSamples( 1 )

        # Assert data is shuffled as expected
        np.testing.assert_array_equal( g_testArray[m_indices],
                                       self.mLearningAgent.trainingData )
class LendingClubFeatureExtractorTest(unittest.TestCase):
    """Unit tests for the LendingClubFeatureExtractor conversion helpers.

    Each test runs the extractor over the rows loaded from the test csv
    resource and checks the converted value against the raw column text.
    """

    def setUp(self):
        """Set up the dependencies for the test execution"""

        # Construct our InputReader object, pass it the test csv file
        self.mInputReader = InputReader(testFile)

        # Construct the class under test with the InputReader
        self.mFeatureExtractor = LendingClubFeatureExtractor(self.mInputReader, filterTestFile)

    def test_termConversion(self):
        """Test termEnumerator functionality"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("term")

        # Loop over all test data and assert proper enumeration
        for row in self.mFeatureExtractor.getTrainingData():
            term = self.mFeatureExtractor.termConversion(row)
            if re.search("36 months", row[idx]):
                self.assertEqual(36, term)
            elif re.search("60 months", row[idx]):
                self.assertEqual(60, term)
            else:
                # Report unexpected fixture data as a test failure rather
                # than as an unrelated error
                self.fail("Encountered unsupported term value")

    def test_pcntRemove(self):
        """Test '%' removal"""

        # Loop over all test data and assert proper conversion
        for row in self.mFeatureExtractor.getTrainingData():
            int_rate = self.mFeatureExtractor.pcntRemove(row, "int_rate")
            revol_util = self.mFeatureExtractor.pcntRemove(row, "revol_util")

            # Assert that no '%' contained in the results
            self.assertNotIn("%", str(int_rate))
            self.assertNotIn("%", str(revol_util))

    def test_loanGradeHash(self):
        """Test loan grade hashing function"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("sub_grade")

        # Expected hashes: "A1" -> 1, "A2" -> 2, ... "G5" -> 35
        mTestDict = {}
        for gradeIdx, grade in enumerate("ABCDEFG"):
            for level in range(1, 6):
                mTestDict[grade + str(level)] = gradeIdx * 5 + level

        # Loop over all test data and assert correct hash is returned
        for row in self.mFeatureExtractor.getTrainingData():
            sub_grade_hash = self.mFeatureExtractor.loanGradeHash(row)
            testKey = row[idx]
            self.assertEqual(mTestDict[testKey], sub_grade_hash)

    def test_empLengthConversion(self):
        """Test employment length function"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("emp_length")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            emp_length = self.mFeatureExtractor.empLengthConversion(row)

            # Convert function calculated text back to expected string
            if emp_length == 0.1:
                emp_length = "<"
            elif emp_length == 20:
                emp_length = "10"
            elif emp_length == 0:
                emp_length = "n/a"
            else:
                # NOTE(review): assumes the conversion returns a value whose
                # str() appears verbatim in the raw text - confirm
                emp_length = str(emp_length)

            # Use converted emp_length for reg exp test against test resource
            self.assertTrue(re.search(emp_length, row[idx]))

    def test_homeOwnershipEnumerator(self):
        """Test home ownership enumeration"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("home_ownership")

        # Test dictionary
        mTestDict = {1: "RENT", 2: "MORTGAGE", 3: "OWN", 4: "OTHER"}

        # Loop over all test data and assert correct enumeration is returned
        for row in self.mFeatureExtractor.getTrainingData():
            homeOwnE = self.mFeatureExtractor.homeOwnershipEnumerator(row)

            # Convert function result back to expected string; any value
            # outside the known enumeration maps to "FAIL" so the assertion
            # below fails instead of the lookup raising KeyError
            homeOwn = mTestDict.get(homeOwnE, "FAIL")

            # Use converted homeOwn for reg exp test against test resource
            self.assertTrue(re.search(homeOwn, row[idx]))

    def test_incomeVerifiedConversion(self):
        """Test income verification conversion"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("is_inc_v")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            is_inc_v = self.mFeatureExtractor.incomeVerifiedConversion(row)

            # Assert that when is_inc_v is 0, input contains 'Not' string
            if is_inc_v == 0:
                self.assertTrue(re.search("Not", row[idx]))
            else:
                self.assertFalse(re.search("Not", row[idx]))

    def test_purposeEnumerator(self):
        """Test loan purpose enumeration"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("purpose")

        # Test dictionary - This must align with UUT dict!!
        # NOTE(review): there is no entry for 6 - confirm against the UUT
        mTestDict = {
            1: "house",
            2: "home_improvement",
            3: "medical",
            4: "education",
            5: "debt_consolidation",
            7: "small_business",
            8: "major_purchase",
            9: "car",
            10: "credit_card",
            11: "wedding",
            12: "vacation",
        }

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            purpose = self.mFeatureExtractor.purposeEnumerator(row)

            # Assert string is found in test data based on returned enum.
            # Enum values without a dictionary entry are not checked.
            if purpose in mTestDict:
                self.assertTrue(re.search(mTestDict[purpose], row[idx]))

    def test_stateEnumerator(self):
        """Test state enumeration"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("addr_state")

        # Test dictionary - This must align with UUT dict!!
        # Codes are enumerated from 1 in the order listed ("AK" -> 1,
        # "AL" -> 2, ... "WY" -> 53).
        mStates = (
            "AK AL AR AZ CA CO CT DC DE FL "
            "GA HI IA ID IL IN KS KY LA MA "
            "MD ME MI MN MO MS MT NC ND NE "
            "NH NJ NM NV NY OH OK OR PA PR "
            "RI SC SD TN TX UT VA VI VT WA "
            "WI WV WY"
        )
        mTestDict = dict(enumerate(mStates.split(), 1))

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            addr_stateE = self.mFeatureExtractor.stateEnumerator(row)

            # Assert returned value matches w/ test dictionary
            self.assertTrue(re.search(mTestDict[addr_stateE], row[idx]))

    def test_earlyCrLineConversion(self):
        """Test date conversion"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("earliest_cr_line")

        # Time elapsed since 2014 for the 1972 test date below
        delta = 42

        # Build a row with null entries ahead of the test date to simulate
        # feature placement in the training set
        testDate = [""] * idx + ["01/01/1972  01:50"]

        # Assert time elapsed since epoch is correct for given test time
        self.assertEqual(delta, self.mFeatureExtractor.earlyCrLineConversion(testDate))

    def test_statusConversion(self):
        """Loan status conversion test"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("loan_status")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            loan_status = self.mFeatureExtractor.statusConversion(row)

            # Assert that correct loan_status is returned based on test input
            if loan_status == 0:
                self.assertTrue(re.search("Charged Off", row[idx]))
            elif loan_status == 1:
                self.assertTrue(re.search("Fully Paid", row[idx]))
            else:
                self.assertFalse(re.search("Fully Paid|Charged Off", row[idx]))

    def test_extractFeatures(self):
        """Feature extraction test"""

        # Get initial training sample count
        nSamples = self.mFeatureExtractor.getSampleCnt()

        # Invoke feature extraction on our test object
        self.mFeatureExtractor.extractFeatures()

        # Make local sample removal count
        nRmvSamples = nSamples - self.mFeatureExtractor.getSampleCnt()

        # Assert local removal calculation corresponds with actual
        self.assertEqual(nRmvSamples, self.mFeatureExtractor.getRmvSampleCnt())
Esempio n. 5
0
def _buildArgParser():
    """Return the argparse parser describing the command line interface."""

    # Construct an argument parser for cmd line interaction
    parser = argparse.ArgumentParser(
        description="This is a risk valuation \
    software package for loan analysis.  The software is used for predicting  \
    whether a given applicant is likely or not to repay a given loan."
    )

    # Application version readback option
    parser.add_argument("-v", "--version", action="version", version=appVersion)

    # Option to pass in an input file to be processed
    parser.add_argument("-i", "--input", dest="inputFile", help="Input File Name", required=False, default=defaultInput)

    # Option to specify the type of learning agent to be used
    parser.add_argument(
        "--classifier",
        dest="cls",
        help="Machine Learning classifier type. \n \
                         Current possible options are: \n \
                         'logistic'(default), 'SVM', 'dTree'",
        required=False,
        default="logistic",
    )

    # Option to specify the SVM kernel to be used
    parser.add_argument(
        "-k",
        "--kernel",
        dest="kernel",
        help="SVM kernel type. \n Possible options are: \n \
                         'linear', 'poly', 'rbf'(default), or 'sigmoid' ",
        required=False,
        default="rbf",
    )

    # Option to specify the test fraction used for learning.  type=float
    # makes argparse reject non-numeric input with a usage error.
    parser.add_argument(
        "--testFraction",
        dest="tstFrac",
        help="Fraction of data to be used for test, must be \
                         between 0 and 1",
        required=False,
        type=float,
        default=0.2,
    )

    # Option to specify pre-training dump file
    parser.add_argument("-d", "--dump", dest="dumpFile", help="File location for pre-trained data dump", required=False)

    # Option to specify learning regularization parameter
    parser.add_argument(
        "-C", "--reg", dest="reg", help="Classifier regularization parameter", required=False, type=float, default=1
    )

    # Option to specify filter path
    parser.add_argument(
        "--filter",
        dest="filterPath",
        help="Feature Filter resource file",
        required=False,
        default="../res/FeatureFilter.csv",
    )

    # Option to predict output of some input sample(s)
    parser.add_argument(
        "-p",
        "--predict",
        dest="predict",
        help="Run application in prediction mode. \
                         Prediction input samples must be located at \
                         ${PROJ_DIR}/tmp/predictInputSamples.csv \
                         (must first train a classifier!!)",
        required=False,
        action="store_true",
    )

    return parser


def _loadFeatures(inputFile, filterPath, dumpFile):
    """Read inputFile and return a LendingClubFeatureExtractor with the
    features extracted and filtered; also dump them to dumpFile if given."""

    # Construct the InputReader w/ our input file
    mInputReader = InputReader(inputFile)

    # Next, construct our LendingClubFeatureExtractor object
    mFeatureExtractor = LendingClubFeatureExtractor(mInputReader, filterPath)

    # Use the FeatureExtractor to convert the data
    mFeatureExtractor.extractFeatures()
    mFeatureExtractor.applyFeatureFilter()

    # Dump pre-trained data if specified by user
    if dumpFile is not None:
        mFeatureExtractor.setOutCSVPath(dumpFile)
        mFeatureExtractor.writeFeaturesToCSV()

    return mFeatureExtractor


def _makeLearningAgent(cls, mFeatureExtractor, kernel):
    """Return the learning agent selected by cls, or None if cls is not a
    supported classifier name."""
    if cls == "SVM":
        return SVMClassifier(mFeatureExtractor, kernel)
    if cls == "logistic":
        return LogisticClassifier(mFeatureExtractor)
    if cls == "dTree":
        return DecisionTreeClassifier(mFeatureExtractor)
    return None


def _predictSamples(mFeatureExtractor):
    """Load the stored classifier and scaler and print a prediction for
    every sample held by mFeatureExtractor."""

    # Try to read in the stored classifier
    try:
        clf = joblib.load(clfDumpLoc)
    except FileNotFoundError:
        print("Error! No classifier binary file found.")
        print("Did you train a classifier yet??")
        return

    # Get the output idx and remove the appropriate column from the input
    outputIdx = mFeatureExtractor.listIdx("loan_status")
    data = mFeatureExtractor.getTrainingData()
    data = np.delete(data, outputIdx, 1)

    # Standarize data to zero mean and unit variance - this should ideally
    # be same as classifier's scaling factor
    # NOTE(review): pickle executes arbitrary code on load; scalerDumpLoc
    # must only ever point at a file written by this application.
    try:
        with open(scalerDumpLoc, "rb") as f:
            scaler = pickle.load(f)
    except FileNotFoundError:
        print("Error! No scaler binary file found.")
        print("Did you train a classifier yet??")
        return

    data = scaler.transform(data)

    # Loop through all of the inputs and make a prediction
    print()
    for sample in data:
        # Estimators expect 2-D input of shape (n_samples, n_features), so
        # present each row as a single-sample batch
        sample = sample.reshape(1, -1)

        if clf.predict(sample)[0] == 0:
            prediction = "Charged Off"
            result = 0
        else:
            prediction = "Fully Paid"
            result = 1

        print("Predicted outcome of loan: %s" % prediction)
        # NOTE(review): indexing by result assumes the probability columns
        # are ordered [0, 1] - confirm against clf.classes_
        print("Certainty in outcome is: %.1f percent" % (clf.predict_proba(sample)[0][result] * 100))
        print()

    print(clf)
    print()


def main():
    """Command line entry point.

    Parses the command line, then either trains a classifier on the input
    file (default) or, with -p/--predict, pushes the prediction input
    samples through a previously stored classifier.
    """

    # Grab the inputs passed
    args = _buildArgParser().parse_args()
    m_cls = args.cls
    m_tstFrac = float(args.tstFrac)
    m_reg = float(args.reg)

    # Generate time stamp for performance monitoring
    t0 = time.time()

    # Predict flag set, try read a stored classifier and push our inputs
    # through it
    if args.predict:
        mFeatureExtractor = _loadFeatures(predictInput, args.filterPath, args.dumpFile)
        _predictSamples(mFeatureExtractor)
        return

    # Training mode: convert the input data for learning
    mFeatureExtractor = _loadFeatures(args.inputFile, args.filterPath, args.dumpFile)

    # Construct a LearningAgent based on user input
    mLearningAgent = _makeLearningAgent(m_cls, mFeatureExtractor, args.kernel)
    if mLearningAgent is None:
        print("Invalid classifier passed.  See --help for valid options")
        return

    # Set the test fraction of data to use for validation
    mLearningAgent.setTstFraction(m_tstFrac)

    # Set the learning regularization parameter
    mLearningAgent.setRegularization(m_reg)

    # Apply preprocessing to the training samples
    mLearningAgent.shuffleSamples()
    mLearningAgent.sampleSlice()
    mLearningAgent.standardizeSamples()

    # Train the classifier and report the accuracy against the test subset
    mLearningAgent.trainModel()
    print("Cross Validation accuracy on the test subset = %0.3f" % mLearningAgent.crossValidate())

    # Dump the classifier object to file
    mLearningAgent.dumpClassifier()

    # Print out the classifier coefficients
    if m_cls in ("logistic", "dTree"):
        print("Classifier coefficients:")
        print(mLearningAgent.getClfCoeffs())

    # Generate end time stamp and report processing time
    total = time.time() - t0
    print("Total processing time = %3.2f seconds" % total)