Beispiel #1
0
    def create_full_analysis_dataset(self, file_location):
        ##### RUN TESTING DATASET THROUGH PREDICTIVE MODEL #####
        # open dataset (NOTE: date for analysis set is set in 'op')
        self.t_data = op.open_and_process(file_location, 'transform', 'model2')
        print "PREPARING & RUNNING TEST DATASET THROUGH PREDICTIVE MODELS"
        test_data = self.t_data[
            self.t_data['issue_d'] >= pa.parse_date("2011-01-01")]
        test_data = test_data[test_data['issue_d'] <= "2011-12-31"]
        # delete unneeded columns
        del test_data['status_in_good_standing']
        del test_data['issue_d']
        scaled_test_data = self.scaler.transform(test_data)
        # get lineary regression prediction data (weights)
        lr_prediction_weights = self.lr_clf.decision_function(scaled_test_data)
        # get random forrest prediction data
        rf_predictions = self.rndm_forest.predict(scaled_test_data)

        print "CREATING ANALYSIS DATAFRAME"
        ##### LINK PREDICTION WEIGHTS WITH DATA_TABLE USED FOR ANALYSIS #####
        self.a_data = op.open_and_process(file_location, 'transform',
                                          'analysis')

        analysis_data = self.a_data
        # only use where index matches those from test_dataset
        analysis_data = analysis_data.loc[test_data.index]
        print "months included in test data: " + str(
            set(analysis_data['yy-mm_start_date']))

        analysis_data['lin_reg_weights'] = lr_prediction_weights
        analysis_data['rndm_forrest_predictions'] = rf_predictions
        ####analysis_data = analysis_data.dropna(axis=0)

        # saving for problem sovling
        print "ANALYSIS COMPLETE"
        self.analysis_df = analysis_data
Beispiel #2
0
    def prep_data(self, file_location):
        # open dataset (NOTE: date for training set is set in 'op')
        self.data = op.open_and_process(file_location, 'transform', 'model')

        # separate by issue date (only train using data from before issue date X)
        self.training_data = self.data[
            self.data['issue_d'] < pa.parse_date("2011-01-01")]
        # delete issue date column (no longer needed)
        del self.training_data['issue_d']
        print self.training_data.columns
        # separate predictors into it's own dataset
        self.predictors = self.training_data['status_in_good_standing']

        # separate features from original training dataset
        self.training_features = self.training_data
        del self.training_features['status_in_good_standing']
Beispiel #3
0
	def open_file(self, file_location):
		d = op.open_and_process(file_location, 'transform', 'model', 'train')
		del d['status_Default']
		del d['issue_d']
		self.data = d
		print "run .assign_to_groups()"