Esempi in Python per LendingClubFeatureExtractor

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: lendingClubFeatureExtractor

Classe/tipologia: LendingClubFeatureExtractor

Esempi su hotexamples.com: 5

LendingClubFeatureExtractor in Python: 5 esempi trovati. Questi sono i migliori esempi reali in Python per lendingClubFeatureExtractor.LendingClubFeatureExtractor, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

listIdx(3)

extractFeatures(2)

getTrainingData(2)

pcntRemove(1)

termConversion(1)

statusConversion(1)

stateEnumerator(1)

setTrainingData(1)

setOutCSVPath(1)

purposeEnumerator(1)

applyFeatureFilter(1)

loanGradeHash(1)

earlyCrLineConversion(1)

incomeVerifiedConversion(1)

homeOwnershipEnumerator(1)

getSampleCnt(1)

getRmvSampleCnt(1)

empLengthConversion(1)

writeFeaturesToCSV(1)

Esempio n. 1

Mostra file

File: lendingClubFeatureExtractorTest.py Progetto: hosse005/LoanLearner

    def setUp(self):
        """Create the reader/extractor pair that each test case relies on"""

        # The extractor is the class under test; it is fed by an InputReader
        # bound to the test csv file
        csvReader = InputReader(testFile)
        self.mInputReader = csvReader
        self.mFeatureExtractor = LendingClubFeatureExtractor(csvReader, filterTestFile)

Esempio n. 2

Mostra file

File: learningAgentTest.py Progetto: hosse005/LoanLearner

    def setUp( self ):
        '''Wire up the collaborators required by each test'''

        # The FeatureExtractor is injected into the agent, and itself needs
        # an InputReader
        reader = InputReader( testFile )
        self.mInputReader = reader

        extractor = LendingClubFeatureExtractor( reader, filterFile )
        self.mFeatureExtractor = extractor

        # Replace the extractor's training data with the local test array
        extractor.setTrainingData( g_testArray )

        # Class under test, built on top of the prepared FeatureExtractor
        self.mLearningAgent = DummyLearningAgentImpl( extractor )

Esempio n. 3

Mostra file

File: learningAgentTest.py Progetto: hosse005/LoanLearner

class LearningAgentTest( unittest.TestCase ):
    '''
    Unit tests for the learning agent functions, exercised through
    DummyLearningAgentImpl on top of a LendingClubFeatureExtractor that has
    been loaded with g_testArray
    '''

    # Absolute tolerance used for every floating point comparison below
    TOLERANCE = 0.001

    def setUp( self ):
        '''Set up the dependencies for the test execution'''

        # Construct an InputReader and FeatureExtractor for dependency injection
        self.mInputReader = InputReader( testFile )
        self.mFeatureExtractor = LendingClubFeatureExtractor( self.mInputReader,
                                                              filterFile )

        # Push our local test data into the FeatureExtractor
        self.mFeatureExtractor.setTrainingData( g_testArray )

        # Now, construct the class under test with the FeatureExtractor
        self.mLearningAgent = DummyLearningAgentImpl( self.mFeatureExtractor )


    def test_getTrainingData( self ):
        '''Test getTrainingData() function returns correct data'''
        np.testing.assert_array_equal( self.mLearningAgent.getTrainingData(),
                                       g_testArray )


    def test_sampleSlice( self ):
        '''Test sampleSlice() function correctly splits test and train data'''

        # Configure the data to be split evenly for the test
        self.mLearningAgent.sampleSlice( 0.5 )

        # Get the target feature index
        m_yIdx = self.mFeatureExtractor.listIdx( 'loan_status' )

        # Slice boundary
        mBnd = ceil( len(g_testArray) / 2 )

        # Expected train / test partitions of the local test array
        mTrainRows = g_testArray[:mBnd]
        mTestRows = g_testArray[mBnd:]

        # Checksums are compared with |expected - actual| <= TOLERANCE.  The
        # absolute value must wrap the difference only; wrapping the whole
        # comparison would make the check one-sided.

        # X_train checksum: every column except the target
        m_X_train_sum = np.sum( mTrainRows ) - np.sum( mTrainRows[:, m_yIdx] )
        self.assertAlmostEqual( m_X_train_sum,
                                np.sum( self.mLearningAgent.X_train ),
                                delta=self.TOLERANCE )

        # X_test checksum: every column except the target
        m_X_test_sum = np.sum( mTestRows ) - np.sum( mTestRows[:, m_yIdx] )
        self.assertAlmostEqual( m_X_test_sum,
                                np.sum( self.mLearningAgent.X_test ),
                                delta=self.TOLERANCE )

        # y_train checksum: target column only
        m_y_train_sum = np.sum( mTrainRows[:, m_yIdx] )
        self.assertAlmostEqual( m_y_train_sum,
                                np.sum( self.mLearningAgent.y_train ),
                                delta=self.TOLERANCE )

        # y_test checksum: target column only
        m_y_test_sum = np.sum( mTestRows[:, m_yIdx] )
        self.assertAlmostEqual( m_y_test_sum,
                                np.sum( self.mLearningAgent.y_test ),
                                delta=self.TOLERANCE )


    def test_standardizeSamples( self ):
        '''
        Test standardizeSamples() function calculates mean and deviation and 
        applies it properly to all samples of a given feature
        Note: Just testing the first feature here
        '''

        # Configure the data to be split evenly for the test
        self.mLearningAgent.sampleSlice( 0.5 )

        # Snapshot the first feature before it gets standardized
        mFeature = np.array( self.mLearningAgent.X_train[:, 0], dtype=float )

        # Calculate average and (population) standard deviation
        mAvg = np.sum( mFeature ) / len( mFeature )
        mStdDev = np.sqrt( np.sum( np.square( mFeature - mAvg ) )
                           / len( mFeature ) )

        # Apply calculated average and standard deviation to samples
        mNorm = ( mFeature - mAvg ) / mStdDev

        # Execute LearningAgent implementation
        self.mLearningAgent.standardizeSamples()

        # Assert local calculation matches with LearningAgent implementation.
        # Compared element-wise: a sum of signed differences of two zero-mean
        # columns is ~0 even when the scaling is wrong.
        np.testing.assert_allclose( self.mLearningAgent.X_train[:, 0], mNorm,
                                    rtol=0, atol=self.TOLERANCE )


    def test_shuffleSamples( self ):
        '''Test shuffleSamples() function shuffles samples correctly'''

        # Initialize random number generator w/ known seed
        np.random.seed( 1 )

        # Generate index list
        m_indices = np.random.permutation( len( g_testArray ) )

        # Make LearningAgent call w/ the same seed
        self.mLearningAgent.shuffleSamples( 1 )

        # Assert data is shuffled as expected
        np.testing.assert_array_equal( g_testArray[m_indices], 
                                       self.mLearningAgent.trainingData )

Esempio n. 4

Mostra file

File: lendingClubFeatureExtractorTest.py Progetto: hosse005/LoanLearner

class LendingClubFeatureExtractorTest(unittest.TestCase):
    """Unit tests for the LendingClubFeatureExtractor conversion functions.

    Each test reads the rows exposed by getTrainingData() and checks one
    converter's result against the raw text of the matching column.
    """

    def setUp(self):
        """Set up the dependencies for the test execution"""

        # Construct our InputReader object, pass it the test csv file
        self.mInputReader = InputReader(testFile)

        # Construct the class under test with the InputReader
        self.mFeatureExtractor = LendingClubFeatureExtractor(self.mInputReader, filterTestFile)

    def test_termConversion(self):
        """Test termEnumerator functionality"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("term")

        # Loop over all test data and assert proper enumeration
        for row in self.mFeatureExtractor.getTrainingData():
            term = self.mFeatureExtractor.termConversion(row)
            if re.search("36 months", row[idx]):
                self.assertEqual(36, term)
            elif re.search("60 months", row[idx]):
                self.assertEqual(60, term)
            else:
                raise ValueError("Encountered unsupported term value")

    def test_pcntRemove(self):
        """Test '%' removal"""

        # Loop over all test data and assert proper conversion
        for row in self.mFeatureExtractor.getTrainingData():
            int_rate = self.mFeatureExtractor.pcntRemove(row, "int_rate")
            revol_util = self.mFeatureExtractor.pcntRemove(row, "revol_util")

            # Assert that no '%' contained in the results
            self.assertNotIn("%", str(int_rate))
            self.assertNotIn("%", str(revol_util))

    def test_loanGradeHash(self):
        """Test loan grade hashing function"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("sub_grade")

        # Hardcode test dictionary
        mTestDict = {
            "A1": 1, "A2": 2, "A3": 3, "A4": 4, "A5": 5,
            "B1": 6, "B2": 7, "B3": 8, "B4": 9, "B5": 10,
            "C1": 11, "C2": 12, "C3": 13, "C4": 14, "C5": 15,
            "D1": 16, "D2": 17, "D3": 18, "D4": 19, "D5": 20,
            "E1": 21, "E2": 22, "E3": 23, "E4": 24, "E5": 25,
            "F1": 26, "F2": 27, "F3": 28, "F4": 29, "F5": 30,
            "G1": 31, "G2": 32, "G3": 33, "G4": 34, "G5": 35,
        }

        # Loop over all test data and assert correct hash is returned
        for row in self.mFeatureExtractor.getTrainingData():
            sub_grade_hash = self.mFeatureExtractor.loanGradeHash(row)
            testKey = row[idx]
            self.assertEqual(mTestDict[testKey], sub_grade_hash)

    def test_empLengthConversion(self):
        """Test employment length function"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("emp_length")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            emp_length = self.mFeatureExtractor.empLengthConversion(row)

            # Convert function calculated text back to expected string
            if emp_length == 0.1:
                emp_length = "<"
            elif emp_length == 20:
                emp_length = "10"
            elif emp_length == 0:
                emp_length = "n/a"
            else:
                emp_length = str(emp_length)

            # Use converted emp_length for reg exp test against test resource
            self.assertTrue(re.search(emp_length, row[idx]))

    def test_homeOwnershipEnumerator(self):
        """Test home ownership enumeration"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("home_ownership")

        # Test dictionary
        mTestDict = {1: "RENT", 2: "MORTGAGE", 3: "OWN", 4: "OTHER"}

        # Loop over all test data and assert correct enumeration is returned
        for row in self.mFeatureExtractor.getTrainingData():
            homeOwnE = self.mFeatureExtractor.homeOwnershipEnumerator(row)

            # Convert function result back to expected string.  A membership
            # test is required here: `x == 1 or 2 or 3 or 4` is always true,
            # which would turn an unknown enum into a KeyError instead of a
            # failed assertion.
            homeOwn = mTestDict.get(homeOwnE, "FAIL")

            # Use converted homeOwn for reg exp test against test resource
            self.assertTrue(re.search(homeOwn, row[idx]))

    def test_incomeVerifiedConversion(self):
        """Test income verification conversion"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("is_inc_v")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            is_inc_v = self.mFeatureExtractor.incomeVerifiedConversion(row)

            # Assert that when is_inc_v is 0, input contains 'Not' string
            if is_inc_v == 0:
                self.assertTrue(re.search("Not", row[idx]))
            else:
                self.assertFalse(re.search("Not", row[idx]))

    def test_purposeEnumerator(self):
        """Test loan purpose enumeration"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("purpose")

        # Test dictionary - This must align with UUT dict!!
        # NOTE(review): there is no entry for 6 - confirm against the UUT dict
        mTestDict = {
            1: "house",
            2: "home_improvement",
            3: "medical",
            4: "education",
            5: "debt_consolidation",
            7: "small_business",
            8: "major_purchase",
            9: "car",
            10: "credit_card",
            11: "wedding",
            12: "vacation",
        }

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            purpose = self.mFeatureExtractor.purposeEnumerator(row)

            # Assert string is found in test data based on returned enum.
            # Enums without a dictionary entry are skipped.  A substring
            # check reports a mismatch as a test failure, where calling
            # .group() on a failed re.search would raise AttributeError.
            if purpose in mTestDict:
                self.assertIn(mTestDict[purpose], row[idx])

    def test_stateEnumerator(self):
        """Test state enumeration"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("addr_state")

        # Test dictionary - This must align with UUT dict!!
        mTestDict = {
            1: "AK", 2: "AL", 3: "AR", 4: "AZ", 5: "CA", 6: "CO",
            7: "CT", 8: "DC", 9: "DE", 10: "FL", 11: "GA", 12: "HI",
            13: "IA", 14: "ID", 15: "IL", 16: "IN", 17: "KS", 18: "KY",
            19: "LA", 20: "MA", 21: "MD", 22: "ME", 23: "MI", 24: "MN",
            25: "MO", 26: "MS", 27: "MT", 28: "NC", 29: "ND", 30: "NE",
            31: "NH", 32: "NJ", 33: "NM", 34: "NV", 35: "NY", 36: "OH",
            37: "OK", 38: "OR", 39: "PA", 40: "PR", 41: "RI", 42: "SC",
            43: "SD", 44: "TN", 45: "TX", 46: "UT", 47: "VA", 48: "VI",
            49: "VT", 50: "WA", 51: "WI", 52: "WV", 53: "WY",
        }

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            addr_stateE = self.mFeatureExtractor.stateEnumerator(row)

            # Report an unknown enum as a failure rather than a KeyError
            self.assertIn(addr_stateE, mTestDict)

            # Assert returned value matches w/ test dictionary
            self.assertTrue(re.search(mTestDict[addr_stateE], row[idx]))

    def test_earlyCrLineConversion(self):
        """Test date conversion"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("earliest_cr_line")

        # Time elapsed since 2014
        delta = 42

        # Generate a test date, preceded by null entries to simulate feature
        # placement in the training set
        testDate = [""] * idx + ["01/01/1972  01:50"]

        # Assert time elapsed since epoch is correct for given test time
        self.assertEqual(delta, self.mFeatureExtractor.earlyCrLineConversion(testDate))

    def test_statusConversion(self):
        """Loan status conversion test"""

        # Grab appropriate column index
        idx = self.mFeatureExtractor.listIdx("loan_status")

        # Loop over all test data and assert correct conversion is returned
        for row in self.mFeatureExtractor.getTrainingData():
            loan_status = self.mFeatureExtractor.statusConversion(row)

            # Assert that correct loan_status is returned based on test input
            if loan_status == 0:
                self.assertTrue(re.search("Charged Off", row[idx]))
            elif loan_status == 1:
                self.assertTrue(re.search("Fully Paid", row[idx]))
            else:
                self.assertFalse(re.search("Fully Paid|Charged Off", row[idx]))

    def test_extractFeatures(self):
        """Feature extraction test"""

        # Get initial training sample count
        nSamples = self.mFeatureExtractor.getSampleCnt()

        # Invoke feature extraction on our test object
        self.mFeatureExtractor.extractFeatures()

        # Make local sample removal count
        nRmvSamples = nSamples - self.mFeatureExtractor.getSampleCnt()

        # Assert local removal calculation corresponds with actual
        self.assertEqual(nRmvSamples, self.mFeatureExtractor.getRmvSampleCnt())

Esempio n. 5

Mostra file

File: loanLearner.py Progetto: hosse005/LoanLearner

def main():
    """Command line entry point of the loan risk valuation application.

    Parses the command line and then either
      * trains the selected classifier on the input file (default), or
      * with -p/--predict, loads the stored classifier and scaler and prints
        a prediction for every sample read from ``predictInput``.
    """

    # Grab the inputs passed
    parser = _buildArgParser()
    args = parser.parse_args()

    # The defaults are plain numbers, so normalise to float here as well
    m_tstFrac = float(args.tstFrac)
    m_reg = float(args.reg)
    if not 0.0 <= m_tstFrac <= 1.0:
        parser.error("--testFraction must be between 0 and 1")

    # Generate time stamp for performance monitoring
    t0 = time.time()

    # Branch on predict flag
    if args.predict:
        _predictSamples(args.filterPath, args.dumpFile)
    else:
        _trainClassifier(
            args.inputFile, args.filterPath, args.dumpFile, args.cls, args.kernel, m_tstFrac, m_reg, t0
        )


def _buildArgParser():
    """Return the argument parser describing the command line interface."""

    # Construct an argument parser for cmd line interaction
    parser = argparse.ArgumentParser(
        description="This is a risk valuation \
    software package for loan analysis.  The software is used for predicting  \
    whether a given applicant is likely or not to repay a given loan."
    )

    # Application version readback option
    parser.add_argument("-v", "--version", action="version", version=appVersion)

    # Option to pass in an input file to be processed
    parser.add_argument("-i", "--input", dest="inputFile", help="Input File Name", required=False, default=defaultInput)

    # Option to specify the type of learning agent to be used
    parser.add_argument(
        "--classifier",
        dest="cls",
        help="Machine Learning classifier type. \n \
                         Current possible options are: \n \
                         'logistic'(default), 'SVM', 'dTree'",
        required=False,
        default="logistic",
    )

    # Option to specify the SVM kernel to be used
    parser.add_argument(
        "-k",
        "--kernel",
        dest="kernel",
        help="SVM kernel type. \n Possible options are: \n \
                         'linear', 'poly', 'rbf'(default), or 'sigmoid' ",
        required=False,
        default="rbf",
    )

    # Option to specify the test fraction used for learning
    # (type=float lets argparse report a non-numeric value cleanly)
    parser.add_argument(
        "--testFraction",
        dest="tstFrac",
        help="Fraction of data to be used for test, must be \
                         between 0 and 1",
        required=False,
        default=0.2,
        type=float,
    )

    # Option to specify pre-training dump file
    parser.add_argument("-d", "--dump", dest="dumpFile", help="File location for pre-trained data dump", required=False)

    # Option to specify learning regularization parameter
    parser.add_argument(
        "-C", "--reg", dest="reg", help="Classifier regularization parameter", required=False, default=1, type=float
    )

    # Option to specify filter path
    parser.add_argument(
        "--filter",
        dest="filterPath",
        help="Feature Filter resource file",
        required=False,
        default="../res/FeatureFilter.csv",
    )

    # Option to predict output of some input sample(s)
    parser.add_argument(
        "-p",
        "--predict",
        dest="predict",
        help="Run application in prediction mode. \
                         Prediction input samples must be located at \
                         ${PROJ_DIR}/tmp/predictInputSamples.csv \
                         (must first train a classifier!!)",
        required=False,
        action="store_true",
    )

    return parser


def _loadFeatures(inputPath, filterPath, dumpPath):
    """Read inputPath, extract and filter its features, optionally dump them.

    Returns the LendingClubFeatureExtractor holding the converted data.  The
    converted features are written to dumpPath when it is not None.
    """

    # Construct the InputReader w/ our input file
    mInputReader = InputReader(inputPath)

    # Next, construct our LendingClubFeatureExtractor object
    mFeatureExtractor = LendingClubFeatureExtractor(mInputReader, filterPath)

    # Use the FeatureExtractor to convert the data
    mFeatureExtractor.extractFeatures()
    mFeatureExtractor.applyFeatureFilter()

    # Dump pre-trained data if specified by user
    if dumpPath is not None:
        mFeatureExtractor.setOutCSVPath(dumpPath)
        mFeatureExtractor.writeFeaturesToCSV()

    return mFeatureExtractor


def _trainClassifier(inputFile, filterPath, dumpFile, cls, kernel, tstFrac, reg, t0):
    """Train the classifier named by cls and report accuracy and run time.

    t0 is the time stamp taken by the caller; it is used to report the total
    processing time.  Prints an error and returns without training when cls
    is not one of 'SVM', 'logistic' or 'dTree'.
    """

    mFeatureExtractor = _loadFeatures(inputFile, filterPath, dumpFile)

    # Construct a LearningAgent based on user input
    if cls == "SVM":
        mLearningAgent = SVMClassifier(mFeatureExtractor, kernel)
    elif cls == "logistic":
        mLearningAgent = LogisticClassifier(mFeatureExtractor)
    elif cls == "dTree":
        mLearningAgent = DecisionTreeClassifier(mFeatureExtractor)
    else:
        print("Invalid classifier passed.  See --help for valid options")
        return

    # Set the test fraction of data to use for validation
    mLearningAgent.setTstFraction(tstFrac)

    # Set the learning regularization parameter
    mLearningAgent.setRegularization(reg)

    # Apply preprocessing to the training samples
    mLearningAgent.shuffleSamples()
    mLearningAgent.sampleSlice()
    mLearningAgent.standardizeSamples()

    # Train the classifier and report the accuracy against the test subset
    mLearningAgent.trainModel()
    print("Cross Validation accuracy on the test subset = %0.3f" % mLearningAgent.crossValidate())

    # Dump the classifier object to file
    mLearningAgent.dumpClassifier()

    # Print out the classifier coefficients
    if cls in ("logistic", "dTree"):
        print("Classifier coefficients:")
        print(mLearningAgent.getClfCoeffs())

    # Generate end time stamp and report processing time
    total = time.time() - t0
    print("Total processing time = %3.2f seconds" % total)


def _predictSamples(filterPath, dumpFile):
    """Push the samples of ``predictInput`` through the stored classifier.

    Prints the predicted outcome and its certainty for every sample.  Prints
    an error and returns when the stored classifier or scaler is missing.
    """

    mFeatureExtractor = _loadFeatures(predictInput, filterPath, dumpFile)

    # NOTE(security): joblib.load / pickle.load execute arbitrary code from
    # the file they read - only ever point these at files written by a
    # trusted training run.

    # Try to read in the stored classifier
    try:
        clf = joblib.load(clfDumpLoc)
    except FileNotFoundError:
        print("Error! No classifier binary file found.")
        print("Did you train a classifier yet??")
        return

    # Get the output idx and remove the appropriate column from the input
    outputIdx = mFeatureExtractor.listIdx("loan_status")
    data = mFeatureExtractor.getTrainingData()
    data = np.delete(data, outputIdx, 1)

    # Standarize data to zero mean and unit variance - this should ideally
    # be same as classifier's scaling factor
    try:
        with open(scalerDumpLoc, "rb") as f:
            scaler = pickle.load(f)
    except FileNotFoundError:
        print("Error! No scaler binary file found.")
        print("Did you train a classifier yet??")
        return

    data = scaler.transform(data)

    # Loop through all of the inputs and make a prediction
    print()
    for sample in data:
        # Estimators expect a 2-D (n_samples, n_features) array, so present
        # the single sample as a one-row matrix
        row = sample.reshape(1, -1)

        if clf.predict(row)[0] == 0:
            prediction = "Charged Off"
            result = 0
        else:
            prediction = "Fully Paid"
            result = 1

        print("Predicted outcome of loan: %s" % prediction)
        print("Certainty in outcome is: %.1f percent" % (clf.predict_proba(row)[0][result] * 100))
        print()

    print(clf)
    print()