def setUp(self):
    """Build the fixtures shared by the feature extractor tests.

    An InputReader is created for the test csv file and injected, together
    with the test filter file, into the LendingClubFeatureExtractor under
    test.
    """
    # Reader first: the extractor receives it through its constructor
    reader = InputReader(testFile)
    self.mInputReader = reader

    # Class under test
    self.mFeatureExtractor = LendingClubFeatureExtractor(reader, filterTestFile)
def setUp( self ):
    '''Create the collaborators and the learning agent exercised by the tests'''
    # Dependencies that get injected into the class under test
    reader = InputReader( testFile )
    self.mInputReader = reader
    extractor = LendingClubFeatureExtractor( reader, filterFile )
    self.mFeatureExtractor = extractor

    # Replace the extractor's training data with the local test array
    extractor.setTrainingData( g_testArray )

    # Class under test, built on top of the prepared FeatureExtractor
    self.mLearningAgent = DummyLearningAgentImpl( extractor )
class LearningAgentTest( unittest.TestCase ):
    '''Unit tests for the LearningAgent preprocessing functions, run against DummyLearningAgentImpl'''

    def setUp( self ):
        '''Set up the dependencies for the test execution'''
        # Construct an InputReader and FeatureExtractor for dependency injection
        self.mInputReader = InputReader( testFile )
        self.mFeatureExtractor = LendingClubFeatureExtractor( self.mInputReader, filterFile )

        # Push our local test data into the FeatureExtractor
        self.mFeatureExtractor.setTrainingData( g_testArray )

        # Now, construct the class under test with the FeatureExtractor
        self.mLearningAgent = DummyLearningAgentImpl( self.mFeatureExtractor )

    def _assertChecksum( self, expected, actual, tolerance=0.001 ):
        '''
        Assert that the sum over `actual` is within `tolerance` of `expected`

        expected  -- locally calculated checksum
        actual    -- array produced by the LearningAgent
        tolerance -- maximum allowed absolute difference
        '''
        # fabs() must wrap the difference only. Wrapping the whole comparison
        # evaluates fabs( True / False ), which is truthy for ANY negative
        # difference and therefore never detects an actual sum that is too big
        self.assertLess( fabs( expected - np.sum( actual ) ), tolerance )

    def test_getTrainingData( self ):
        '''Test getTrainingData() function returns correct data'''
        np.testing.assert_array_equal( self.mLearningAgent.getTrainingData(), g_testArray )

    def test_sampleSlice( self ):
        '''Test sampleSlice() function correctly splits test and train data'''
        # Configure the data to be split evenly for the test
        self.mLearningAgent.sampleSlice( 0.5 )

        # Get the target feature index
        m_yIdx = self.mFeatureExtractor.listIdx( 'loan_status' )

        # Slice boundary
        mBnd = ceil( len( g_testArray ) / 2 )

        # Expected train / test partitions of the local test array
        mTrain = g_testArray[:mBnd]
        mTest = g_testArray[mBnd:]

        # Target column checksums
        m_y_train_sum = np.sum( mTrain[:, m_yIdx] )
        m_y_test_sum = np.sum( mTest[:, m_yIdx] )

        # Feature checksums are the partition sums without the target column
        m_X_train_sum = np.sum( mTrain ) - m_y_train_sum
        m_X_test_sum = np.sum( mTest ) - m_y_test_sum

        # Verify sums match w/in some small tolerance
        self._assertChecksum( m_X_train_sum, self.mLearningAgent.X_train )
        self._assertChecksum( m_X_test_sum, self.mLearningAgent.X_test )
        self._assertChecksum( m_y_train_sum, self.mLearningAgent.y_train )
        self._assertChecksum( m_y_test_sum, self.mLearningAgent.y_test )

    def test_standardizeSamples( self ):
        '''
        Test standardizeSamples() function calculates mean and deviation
        and applies it properly to all samples of a given feature
        Note: Just testing the first feature here
        '''
        # Configure the data to be split evenly for the test
        self.mLearningAgent.sampleSlice( 0.5 )

        # Calculate average and (population) standard deviation
        mFeature = self.mLearningAgent.X_train[:, 0]
        mCnt = len( mFeature )
        mAvg = sum( mFeature ) / mCnt
        mStdDev = sum( np.square( np.subtract( mFeature, mAvg ) ) )
        mStdDev = sqrt( fabs( mStdDev / mCnt ) )

        # Apply calculated average and standard deviation to samples
        mNorm = np.divide( np.subtract( mFeature, mAvg ), mStdDev )

        # Execute LearningAgent implementation
        self.mLearningAgent.standardizeSamples()

        # Assert local calculation matches with LearningAgent implementation.
        # Compare element-wise: centered values always sum to ~0, so comparing
        # the sum of the differences would hide a wrong scaling factor
        np.testing.assert_allclose( self.mLearningAgent.X_train[:, 0], mNorm, rtol=0, atol=0.001 )

    def test_shuffleSamples( self ):
        '''Test shuffleSamples() function shuffles samples correctly'''
        # Initialize random number generator w/ known seed
        np.random.seed( 1 )

        # Generate index list
        m_indices = np.random.permutation( len( g_testArray ) )

        # Make LearningAgent call w/ the same seed
        self.mLearningAgent.shuffleSamples( 1 )

        # Assert data is shuffled as expected
        np.testing.assert_array_equal( g_testArray[m_indices], self.mLearningAgent.trainingData )
class LendingClubFeatureExtractorTest(unittest.TestCase):
    """Unit tests for the LendingClubFeatureExtractor conversion functions."""

    def setUp(self):
        """Set up the dependencies for the test execution"""
        # Construct our InputReader object, pass it the test csv file
        self.mInputReader = InputReader(testFile)

        # Construct the class under test with the InputReader
        self.mFeatureExtractor = LendingClubFeatureExtractor(self.mInputReader, filterTestFile)

    def test_termConversion(self):
        """Test termEnumerator functionality"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("term")

        # Loop over all test data and assert proper enumeration
        for row in self.mFeatureExtractor.getTrainingData():
            term = self.mFeatureExtractor.termConversion(row)
            if re.search("36 months", row[idx]):
                self.assertEqual(36, term)
            elif re.search("60 months", row[idx]):
                self.assertEqual(60, term)
            else:
                raise ValueError("Encountered unsupported term value")

    def test_pcntRemove(self):
        """Test '%' removal"""
        # Loop over all test data and assert proper conversion
        for row in self.mFeatureExtractor.getTrainingData():
            int_rate = self.mFeatureExtractor.pcntRemove(row, "int_rate")
            revol_util = self.mFeatureExtractor.pcntRemove(row, "revol_util")

            # Assert that no '%' contained in the results
            self.assertFalse(re.search("%", str(int_rate)))
            self.assertFalse(re.search("%", str(revol_util)))

    def test_loanGradeHash(self):
        """Test loan grade hashing function"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("sub_grade")

        # Hardcode test dictionary
        mTestDict = {
            "A1": 1,
            "A2": 2,
            "A3": 3,
            "A4": 4,
            "A5": 5,
            "B1": 6,
            "B2": 7,
            "B3": 8,
            "B4": 9,
            "B5": 10,
            "C1": 11,
            "C2": 12,
            "C3": 13,
            "C4": 14,
            "C5": 15,
            "D1": 16,
            "D2": 17,
            "D3": 18,
            "D4": 19,
            "D5": 20,
            "E1": 21,
            "E2": 22,
            "E3": 23,
            "E4": 24,
            "E5": 25,
            "F1": 26,
            "F2": 27,
            "F3": 28,
            "F4": 29,
            "F5": 30,
            "G1": 31,
            "G2": 32,
            "G3": 33,
            "G4": 34,
            "G5": 35,
        }

        # Loop over all test data and assert correct hash is returned
        for row in self.mFeatureExtractor.getTrainingData():
            sub_grade_hash = self.mFeatureExtractor.loanGradeHash(row)
            testKey = row[idx]
            self.assertEqual(mTestDict[testKey], sub_grade_hash)

    def test_empLengthConversion(self):
        """Test employment length function"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("emp_length")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            emp_length = self.mFeatureExtractor.empLengthConversion(row)

            # Convert function calculated text back to expected string
            if emp_length == 0.1:
                emp_length = "<"
            elif emp_length == 20:
                emp_length = "10"
            elif emp_length == 0:
                emp_length = "n/a"
            else:
                emp_length = str(emp_length)

            # Use converted emp_length for reg exp test against test resource
            self.assertTrue(re.search(emp_length, row[idx]))

    def test_homeOwnershipEnumerator(self):
        """Test home ownership enumeration"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("home_ownership")

        # Test dictionary
        mTestDict = {1: "RENT", 2: "MORTGAGE", 3: "OWN", 4: "OTHER"}

        # Loop over all test data and assert correct enumeration is returned
        for row in self.mFeatureExtractor.getTrainingData():
            homeOwnE = self.mFeatureExtractor.homeOwnershipEnumerator(row)

            # Convert function result back to expected string. An unknown
            # enumeration maps to "FAIL" so the assertion below fails cleanly.
            # (`homeOwnE == 1 or 2 or 3 or 4` is always true and would raise
            # a KeyError here instead of reporting a test failure.)
            homeOwn = mTestDict.get(homeOwnE, "FAIL")

            # Use converted homeOwn for reg exp test against test resource
            self.assertTrue(re.search(homeOwn, row[idx]))

    def test_incomeVerifiedConversion(self):
        """Test income verification conversion"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("is_inc_v")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            is_inc_v = self.mFeatureExtractor.incomeVerifiedConversion(row)

            # Assert that when is_inc_v is 0, input contains 'Not' string
            if is_inc_v == 0:
                self.assertTrue(re.search("Not", row[idx]))
            else:
                self.assertFalse(re.search("Not", row[idx]))

    def test_purposeEnumerator(self):
        """Test loan purpose enumeration"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("purpose")

        # Test dictionary - This must align with UUT dict!!
        mTestDict = {
            1: "house",
            2: "home_improvement",
            3: "medical",
            4: "education",
            5: "debt_consolidation",
            7: "small_business",
            8: "major_purchase",
            9: "car",
            10: "credit_card",
            11: "wedding",
            12: "vacation",
        }

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            purpose = self.mFeatureExtractor.purposeEnumerator(row)

            # Assert string is found in test data based on returned enum.
            # Enumerations missing from the test dictionary are not checked.
            if purpose in mTestDict:
                match = re.search(mTestDict[purpose], row[idx])
                # Report a missing match as a test failure rather than an
                # AttributeError raised by calling group() on None
                self.assertIsNotNone(match)
                self.assertEqual(mTestDict[purpose], match.group())

    def test_stateEnumerator(self):
        """Test state enumeration"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("addr_state")

        # Test dictionary - This must align with UUT dict!!
        mTestDict = {
            1: "AK",
            2: "AL",
            3: "AR",
            4: "AZ",
            5: "CA",
            6: "CO",
            7: "CT",
            8: "DC",
            9: "DE",
            10: "FL",
            11: "GA",
            12: "HI",
            13: "IA",
            14: "ID",
            15: "IL",
            16: "IN",
            17: "KS",
            18: "KY",
            19: "LA",
            20: "MA",
            21: "MD",
            22: "ME",
            23: "MI",
            24: "MN",
            25: "MO",
            26: "MS",
            27: "MT",
            28: "NC",
            29: "ND",
            30: "NE",
            31: "NH",
            32: "NJ",
            33: "NM",
            34: "NV",
            35: "NY",
            36: "OH",
            37: "OK",
            38: "OR",
            39: "PA",
            40: "PR",
            41: "RI",
            42: "SC",
            43: "SD",
            44: "TN",
            45: "TX",
            46: "UT",
            47: "VA",
            48: "VI",
            49: "VT",
            50: "WA",
            51: "WI",
            52: "WV",
            53: "WY",
        }

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            addr_stateE = self.mFeatureExtractor.stateEnumerator(row)

            # Assert returned value matches w/ test dictionary
            self.assertTrue(re.search(mTestDict[addr_stateE], row[idx]))

    def test_earlyCrLineConversion(self):
        """Test date conversion"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("earliest_cr_line")

        # Time elapsed since 2014
        delta = 42

        # Generate a test date, preceded by null entries to simulate feature
        # placement in the training set
        testDate = [""] * idx + ["01/01/1972 01:50"]

        # Assert time elapsed since epoch is correct for given test time
        self.assertEqual(delta, self.mFeatureExtractor.earlyCrLineConversion(testDate))

    def test_statusConversion(self):
        """Loan status conversion test"""
        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("loan_status")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            loan_status = self.mFeatureExtractor.statusConversion(row)

            # Assert that correct loan_status is returned based on test input
            if loan_status == 0:
                self.assertTrue(re.search("Charged Off", row[idx]))
            elif loan_status == 1:
                self.assertTrue(re.search("Fully Paid", row[idx]))
            else:
                self.assertFalse(re.search("Fully Paid|Charged Off", row[idx]))

    def test_extractFeatures(self):
        """Feature extraction test"""
        # Get initial training sample count
        nSamples = self.mFeatureExtractor.getSampleCnt()

        # Invoke feature extraction on our test object
        self.mFeatureExtractor.extractFeatures()

        # Make local sample removal count
        nRmvSamples = nSamples - self.mFeatureExtractor.getSampleCnt()

        # Assert local removal calculation corresponds with actual
        self.assertEqual(nRmvSamples, self.mFeatureExtractor.getRmvSampleCnt())
def _buildArgParser():
    """Create the argument parser used for cmd line interaction."""
    parser = argparse.ArgumentParser(
        description="This is a risk valuation \
        software package for loan analysis. The software is used for predicting \
        whether a given applicant is likely or not to repay a given loan."
    )

    # Application version readback option
    parser.add_argument("-v", "--version", action="version", version=appVersion)

    # Option to pass in an input file to be processed
    parser.add_argument("-i", "--input", dest="inputFile", help="Input File Name", required=False, default=defaultInput)

    # Option to specify the type of learning agent to be used
    parser.add_argument(
        "--classifier",
        dest="cls",
        help="Machine Learning classifier type. \n \
        Current possible options are: \n \
        'logistic'(default), 'SVM', 'dTree'",
        required=False,
        default="logistic",
    )

    # Option to specify the SVM kernel to be used
    parser.add_argument(
        "-k",
        "--kernel",
        dest="kernel",
        help="SVM kernel type. \n Possible options are: \n \
        'linear', 'poly', 'rbf'(default), or 'sigmoid' ",
        required=False,
        default="rbf",
    )

    # Option to specify the test fraction used for learning. Parsed as float
    # by argparse so a malformed value is reported as a usage error
    parser.add_argument(
        "--testFraction",
        dest="tstFrac",
        help="Fraction of data to be used for test, must be \
        between 0 and 1",
        required=False,
        type=float,
        default=0.2,
    )

    # Option to specify pre-training dump file
    parser.add_argument("-d", "--dump", dest="dumpFile", help="File location for pre-trained data dump", required=False)

    # Option to specify learning regularization parameter
    parser.add_argument(
        "-C",
        "--reg",
        dest="reg",
        help="Classifier regularization parameter",
        required=False,
        type=float,
        default=1.0,
    )

    # Option to specify filter path
    parser.add_argument(
        "--filter",
        dest="filterPath",
        help="Feature Filter resource file",
        required=False,
        default="../res/FeatureFilter.csv",
    )

    # Option to predict output of some input sample(s)
    parser.add_argument(
        "-p",
        "--predict",
        dest="predict",
        help="Run application in prediction mode. \
        Prediction input samples must be located at \
        ${PROJ_DIR}/tmp/predictInputSamples.csv \
        (must first train a classifier!!)",
        required=False,
        action="store_true",
    )

    return parser


def _extractFeatures(inputFile, filterPath, dumpFile):
    """Read inputFile, extract and filter its features, optionally dump them.

    Returns the LendingClubFeatureExtractor holding the converted data. The
    converted features are written to dumpFile when it is not None.
    """
    # Construct the InputReader w/ our input file
    mInputReader = InputReader(inputFile)

    # Next, construct our LendingClubFeatureExtractor object
    mFeatureExtractor = LendingClubFeatureExtractor(mInputReader, filterPath)

    # Use the FeatureExtractor to convert the data
    mFeatureExtractor.extractFeatures()
    mFeatureExtractor.applyFeatureFilter()

    # Dump pre-trained data if specified by user
    if dumpFile is not None:
        mFeatureExtractor.setOutCSVPath(dumpFile)
        mFeatureExtractor.writeFeaturesToCSV()

    return mFeatureExtractor


def _trainClassifier(args):
    """Train the classifier selected in args, report its accuracy and dump it."""
    # Generate time stamp for performance monitoring
    t0 = time.time()

    mFeatureExtractor = _extractFeatures(args.inputFile, args.filterPath, args.dumpFile)

    # Construct a LearningAgent based on user input
    if args.cls == "SVM":
        mLearningAgent = SVMClassifier(mFeatureExtractor, args.kernel)
    elif args.cls == "logistic":
        mLearningAgent = LogisticClassifier(mFeatureExtractor)
    elif args.cls == "dTree":
        mLearningAgent = DecisionTreeClassifier(mFeatureExtractor)
    else:
        print("Invalid classifier passed. See --help for valid options")
        return

    # Set the test fraction of data to use for validation
    mLearningAgent.setTstFraction(args.tstFrac)

    # Set the learning regularization parameter
    mLearningAgent.setRegularization(args.reg)

    # Apply preprocessing to the training samples
    mLearningAgent.shuffleSamples()
    mLearningAgent.sampleSlice()
    mLearningAgent.standardizeSamples()

    # Train the classifier and report the accuracy against the test subset
    mLearningAgent.trainModel()
    print("Cross Validation accuracy on the test subset = %0.3f" % mLearningAgent.crossValidate())

    # Dump the classifier object to file
    mLearningAgent.dumpClassifier()

    # Print out the classifier coefficients
    if args.cls in ("logistic", "dTree"):
        print("Classifier coefficients:")
        print(mLearningAgent.getClfCoeffs())

    # Generate end time stamp and report processing time
    total = time.time() - t0
    print("Total processing time = %3.2f seconds" % total)


def _predictSamples(args):
    """Push the prediction input samples through the stored classifier."""
    mFeatureExtractor = _extractFeatures(predictInput, args.filterPath, args.dumpFile)

    # Try to read in the stored classifier
    try:
        clf = joblib.load(clfDumpLoc)
    except FileNotFoundError:
        print("Error! No classifier binary file found.")
        print("Did you train a classifier yet??")
        return

    # Get the output idx and remove the appropriate column from the input
    outputIdx = mFeatureExtractor.listIdx("loan_status")
    data = np.delete(mFeatureExtractor.getTrainingData(), outputIdx, 1)

    # Standarize data to zero mean and unit variance - this should ideally
    # be same as classifier's scaling factor
    # NOTE(review): pickle.load can execute arbitrary code stored in the file;
    # only load scaler dumps that were written by this application
    with open(scalerDumpLoc, "rb") as f:
        scaler = pickle.load(f)
    data = scaler.transform(data)

    # Loop through all of the inputs and make a prediction
    print()
    for sample in data:
        # Present each row as a single-sample 2D array of shape
        # (1, n_features); a bare 1D row is rejected by predict() and
        # predict_proba() - assumes clf is a scikit-learn estimator
        sample = sample.reshape(1, -1)

        if clf.predict(sample)[0] == 0:
            prediction = "Charged Off"
            result = 0
        else:
            prediction = "Fully Paid"
            result = 1

        print("Predicted outcome of loan: %s" % prediction)
        print("Certainty in outcome is: %.1f percent" % (clf.predict_proba(sample)[0][result] * 100))
        print()

    print(clf)
    print()


def main():
    """Application entry point.

    Parses the cmd line options and either trains a classifier on the input
    file (default) or, when the predict flag is set, reads the stored
    classifier and prints a prediction for every prediction input sample.
    """
    # Grab the inputs passed
    args = _buildArgParser().parse_args()

    # Branch on predict flag
    if args.predict:
        _predictSamples(args)
    else:
        _trainClassifier(args)