def train(self, table_name, indep, dep, numIter=100, optimizer='irls', precision=0.001):
    '''
        Train a logistic regression model on the specified table for the given
        set of independent and dependent variables.

        Inputs :
        ========
        table_name : (String) input table name
        indep : (String) column containing independent variables as an array, to be used to build the model on
                OR
                (list) a list of strings, where each element of the list is a column name of table_name or is a constant number
        dep : (string) the class label.
        numIter, optimizer, precision : forwarded unchanged as the last three
                arguments of the madlib.logregr call (defaults: 100, 'irls', 0.001).

        Output :
        ========
        self.model : (dict) holding 'indep_org' (indep exactly as passed in),
                'indep' and 'dep' (the column names actually used in the query),
                plus every entry of self.dbconn.fetchModelParams(...) for the result rows.

        Side effects :
        ==============
        Prints the generated statement, and prints the model through
        self.dbconn.printModel.
    '''
    # Remember the caller's original specification; predict() checks its type
    # to decide whether the prediction table needs the same transformation.
    self.model = {'indep_org': indep}

    # If indep is a list, then the input is specified as a list of columns in a table.
    # 1) First, we will transform any categorical columns in this list.
    # 2) We will marshal the values from the columns into an array, that can be
    #    passed on to MADlib's logistic regression algorithm.
    if isinstance(indep, list):
        table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn, table_name, indep, dep)
        # Convert transformed independent columns into an array
        table_name, indep = convertsColsToArray(self.dbconn, table_name, indep, dep)

    self.model['indep'] = indep
    self.model['dep'] = dep

    stmt = ''' select * from madlib.logregr('{table_name}','{dep}','{indep}',{numIter}, '{optimizer}', {precision}) '''
    stmt = stmt.format(
        dep=dep,
        indep=self.model['indep'],
        table_name=table_name,
        numIter=numIter,
        optimizer=optimizer,
        precision=precision
    )

    # Single-argument print calls produce the same output on Python 2 and 3
    # (the former print statements do not parse on Python 3).
    print('\nstatement : ' + stmt)
    print('\n')

    cursor = self.dbconn.getCursor()
    cursor.execute(stmt)
    row_set = self.dbconn.fetchRowsFromCursor(cursor)
    mdl_params = self.dbconn.fetchModelParams(row_set)
    self.dbconn.printModel(row_set)

    # Merge the fitted parameters into the model alongside the bookkeeping keys.
    for param in mdl_params:
        self.model[param] = mdl_params[param]
    return self.model
def predict(self, predict_table_name, actual_label_col='', threshold=0.5):
    '''
        Run the trained model against a table and return the cursor holding the results.

        Input:
        ======
        predict_table_name : (String) the name of the table to be used for prediction
        actual_label_col : (String) the name of the actual label column; it is only
                           handed to pivotCategoricalColumns when the model was
                           trained from a list of columns.
        threshold : (float), the probability beyond which the predicted values will be
                    considered +ve (default: 0.5). Any falsy value (None, 0) makes the
                    query return the raw probability instead of a 0/1 label.

        Output:
        =======
        The cursor on which the statement was executed. The row set contains every
        column of the prediction table plus the predicted value as column 'prediction'.

        Side effects :
        ==============
        Prints the generated statement.
    '''
    # If the independent columns specified in the training method were a list
    # (instead of a column name of type array), we should transform the
    # independent columns in the predict table as well.
    if isinstance(self.model['indep_org'], list):
        predict_table_name, indep, dep, _ = pivotCategoricalColumns(
            self.dbconn, predict_table_name, self.model['indep_org'], actual_label_col
        )
        # Convert transformed independent columns into an array
        # NOTE(review): the array column name returned here is not used; the query
        # below reads self.model['indep'] recorded at training time. Presumably
        # both are the same name -- confirm against convertsColsToArray.
        predict_table_name, indep = convertsColsToArray(self.dbconn, predict_table_name, indep, dep)

    # Logistic function applied to the dot product of the feature array and the coefficients.
    probability = '(1.0/(1.0 + exp(-1.0*madlib.array_dot({indep}, array{coef}::real[]))))'
    if threshold:
        template = ' select *, case when ' + probability + ' > {threshold} THEN 1 ELSE 0 end as prediction from {table_name} '
    else:
        # If threshold is not specified, we will return actual predictions
        template = ' select *, ' + probability + ' as prediction from {table_name} '

    # str.format ignores the unused 'threshold' keyword in the probability-only template.
    stmt = template.format(
        coef=self.model['coef'],
        indep=self.model['indep'],
        table_name=predict_table_name,
        threshold=threshold
    )

    # Single-argument print calls produce the same output on Python 2 and 3
    # (the former print statements do not parse on Python 3).
    print('\nstatement: ' + stmt)
    print('\n')

    cursor = self.dbconn.getCursor()
    cursor.execute(stmt)
    return cursor
def train(self, table_name, indep, dep, numIter=100, optimizer='irls', precision=0.001):
    '''
        Train a logistic regression model on the specified table for the given
        set of independent and dependent variables.

        Inputs :
        ========
        table_name : (String) input table name
        indep : (String) column containing independent variables as an array, to be used to build the model on
                OR
                (list) a list of strings, where each element of the list is a column name of table_name or is a constant number
        dep : (string) the class label.
        numIter, optimizer, precision : forwarded unchanged as the last three
                arguments of the logregr call (defaults: 100, 'irls', 0.001).

        Output :
        ========
        A tuple (self.model, mdl_params):
            self.model : (dict) holding 'indep_org', 'indep', 'dep' and, for every
                         column of the query result, the value found in its row 0.
            mdl_params : the object returned by psql.read_frame for the query
                         (presumably a pandas DataFrame -- confirm).
    '''
    # Keep the caller's original specification; predict() inspects its type.
    self.model = {}
    self.model['indep_org'] = indep

    # A list means "individual columns of the table": pivot the categorical
    # ones, then pack all of them into one array column for MADlib.
    if isinstance(indep, list):
        table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn, table_name, indep, dep)
        table_name, indep = convertsColsToArray(self.dbconn, table_name, indep, dep)

    self.model['indep'] = indep
    self.model['dep'] = dep

    query_template = ''' select * from {madlib_schema}.logregr('{table_name}','{dep}','{indep}',{numIter}, '{optimizer}', {precision}) '''
    stmt = query_template.format(
        dep=dep,
        indep=self.model['indep'],
        table_name=table_name,
        numIter=numIter,
        optimizer=optimizer,
        precision=precision,
        madlib_schema=self.dbconn.madlib_schema
    )
    logging.info('statement :{0}'.format(stmt))

    connection = self.dbconn.getConnection()
    mdl_params = psql.read_frame(stmt, connection)

    # Copy the first-row value of every result column into the model.
    for column in mdl_params.columns:
        self.model[column] = mdl_params.get(column)[0]

    return self.model, mdl_params
def predict(self, predict_table_name, actual_label_col='', threshold=0.5):
    '''
        Run the trained model against a table and return the query results.

        Input:
        ======
        predict_table_name : (String) the name of the table to be used for prediction
        actual_label_col : (String) the name of the actual label column; it is only
                           handed to pivotCategoricalColumns when the model was
                           trained from a list of columns.
        threshold : (float), the probability beyond which the predicted values will be
                    considered +ve (default: 0.5). Any falsy value (None, 0) makes the
                    query return the raw probability instead of a 0/1 label.

        Output:
        =======
        The object returned by psql.read_frame for the query (presumably a pandas
        DataFrame -- confirm): every column of the prediction table plus the
        predicted value as column 'prediction'.
    '''
    # A model trained from a list of columns needs the prediction table put
    # through the same pivot + array-packing steps as the training table.
    trained_from_columns = isinstance(self.model['indep_org'], list)
    if trained_from_columns:
        predict_table_name, pivoted_cols, label_col, _ = pivotCategoricalColumns(
            self.dbconn, predict_table_name, self.model['indep_org'], actual_label_col
        )
        # Only the (possibly new) table name is used afterwards; the query
        # reads the array column name stored in self.model['indep'].
        predict_table_name, pivoted_cols = convertsColsToArray(
            self.dbconn, predict_table_name, pivoted_cols, label_col
        )

    # Logistic function applied to the dot product of the feature array and the coefficients.
    probability = '(1.0/(1.0 + exp(-1.0*{madlib_schema}.array_dot({indep}, array{coef}::real[]))))'
    if not threshold:
        # No threshold: expose the probability itself as the prediction.
        template = ' select *, ' + probability + ' as prediction from {table_name} '
    else:
        template = ' select *, case when ' + probability + ' > {threshold} THEN 1 ELSE 0 end as prediction from {table_name} '

    # str.format ignores the unused 'threshold' keyword in the probability-only template.
    stmt = template.format(
        coef=self.model['coef'],
        indep=self.model['indep'],
        table_name=predict_table_name,
        threshold=threshold,
        madlib_schema=self.dbconn.getMADlibSchema()
    )
    logging.info('statement:{0}'.format(stmt))

    connection = self.dbconn.getConnection()
    return psql.read_frame(stmt, connection)