Ejemplo n.º 1
0
 def train(self, table_name, indep, dep, numIter=100, optimizer='irls',precision=0.001):
     ''' 
       Train a logistic regression model on the specified table 
       for the given set of independent and dependent variables.

       Inputs :
       ========
       table_name : (String) input table name
       indep : (String) column containing independent variables as an array, to be used to build the model on
                                              OR
               (list) a list of strings, where each element of the list is a column name of table_name or is a constant number
       dep : (String) the class label.
       numIter : (int) 4th argument handed to madlib.logregr (default: 100)
       optimizer : (String) 5th argument handed to madlib.logregr (default: 'irls')
       precision : (float) 6th argument handed to madlib.logregr (default: 0.001)

       Output :
       ========
       self.model : (dict) holds 'indep_org' (indep exactly as passed in), 'indep',
                    'dep' and every entry returned by dbconn.fetchModelParams.
                    The fetched rows are also handed to dbconn.printModel.
     '''
     self.model = {}
     # Remember the caller's original specification (list of columns or the
     # name of an array column) before it is possibly transformed below.
     self.model['indep_org'] = indep

     # If indep is a list, then the input is specified as a list of columns in a table.
     if isinstance(indep, list):
         # 1) First, transform any categorical columns in this list.
         table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn,table_name, indep, dep)
         # 2) Marshal the values from the columns into an array that can be
         #    passed on to MADlib's logistic regression algorithm.
         table_name, indep = convertsColsToArray(self.dbconn, table_name, indep, dep)

     self.model['indep'] = indep
     self.model['dep'] = dep

     stmt = '''
               select * 
               from madlib.logregr('{table_name}','{dep}','{indep}',{numIter}, '{optimizer}', {precision}) 
            '''
     stmt = stmt.format(dep=dep,
                        indep=self.model['indep'],
                        table_name=table_name,
                        numIter=numIter,
                        optimizer=optimizer,
                        precision=precision
                       )

     # A single parenthesised argument prints identically under Python 2
     # (print statement) and Python 3 (print function).
     print('\nstatement : ' + stmt)
     print('\n')
     cursor = self.dbconn.getCursor()
     cursor.execute(stmt)

     row_set = self.dbconn.fetchRowsFromCursor(cursor)
     mdl_params = self.dbconn.fetchModelParams(row_set)
     self.dbconn.printModel(row_set)

     # Copy every fitted parameter into the model dict.
     for param in mdl_params:
         self.model[param] = mdl_params[param]

     return self.model
Ejemplo n.º 2
0
        def predict(self, predict_table_name,actual_label_col='',threshold=0.5):
            ''' 
              Score a table with the trained model (self.model).

              Input:
              ======
              predict_table_name : (String) the name of the table to be used for prediction
              actual_label_col : (String) the name of the actual label column. Only used when the
                                 model was trained on a list of columns, in which case it is
                                 passed on to the column transformation helpers.
              threshold : (float), the probability beyond which the predicted values will be considered +ve (default: 0.5).
                          A falsy value (e.g. None or 0) returns the raw probability instead of a 0/1 label.

              Output:
              =======
              The cursor on which the scoring query was executed. The query selects every
              column of the table plus the predicted value as column 'prediction'.
            '''
            # If the independent columns specified in the training method were a list
            # (instead of a column name of type array), the independent columns of the
            # predict table have to be transformed as well.
            if isinstance(self.model['indep_org'], list):
                predict_table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn,predict_table_name, self.model['indep_org'], actual_label_col)
                # Convert transformed independent columns into an array
                # NOTE(review): the array column name returned here is not used; the
                # query below reads self.model['indep'] -- confirm that
                # convertsColsToArray yields the same column name as during training.
                predict_table_name, indep = convertsColsToArray(self.dbconn, predict_table_name, indep, dep)

            if threshold:
                # Classify: 1 when the logistic probability exceeds the threshold, else 0.
                stmt = '''
                          select *,
                                case when (1.0/(1.0 + exp(-1.0*madlib.array_dot({indep}, array{coef}::real[])))) > {threshold} 
                                          THEN 1 ELSE 0 
                                end as prediction 
                          from {table_name}
                       '''.format(coef=self.model['coef'],
                                  indep=self.model['indep'],
                                  table_name=predict_table_name,
                                  threshold=threshold
                                 )
            else:
                # If threshold is not specified, we will return actual predictions
                stmt = '''
                          select *,
                                (1.0/(1.0 + exp(-1.0*madlib.array_dot({indep}, array{coef}::real[]))))  as prediction 
                         from {table_name}
                       '''.format(coef=self.model['coef'],
                                  indep=self.model['indep'],
                                  table_name=predict_table_name
                                 )

            # A single parenthesised argument prints identically under Python 2
            # (print statement) and Python 3 (print function).
            print('\nstatement: ' + stmt)
            print('\n')

            cursor = self.dbconn.getCursor()
            cursor.execute(stmt)
            return cursor
Ejemplo n.º 3
0
 def train(self, table_name, indep, dep, numIter=100, optimizer='irls',precision=0.001):
     ''' 
       Train a logistic regression model on the specified table 
       for the given set of independent and dependent variables.

       Inputs :
       ========
       table_name : (String) input table name
       indep : (String) column containing independent variables as an array, to be used to build the model on
                                              OR
               (list) a list of strings, where each element of the list is a column name of table_name or is a constant number
       dep : (String) the class label.
       numIter : (int) 4th argument handed to logregr (default: 100)
       optimizer : (String) 5th argument handed to logregr (default: 'irls')
       precision : (float) 6th argument handed to logregr (default: 0.001)

       Output :
       ========
       A tuple (self.model, mdl_params):
       self.model : (dict) holds 'indep_org' (indep exactly as passed in), 'indep', 'dep'
                    and, for every column of mdl_params, the value at index 0 of that column.
       mdl_params : the object returned by psql.read_frame for the training query.
     '''
     # Start from a fresh model that remembers the caller's original column
     # specification before any transformation takes place.
     self.model = {'indep_org': indep}

     # A list means "columns of the table": pivot the categorical ones, then
     # pack all of them into one array column for MADlib.
     if isinstance(indep, list):
         table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn,table_name, indep, dep)
         table_name, indep = convertsColsToArray(self.dbconn, table_name, indep, dep)

     self.model['indep'] = indep
     self.model['dep'] = dep

     query_template = '''
               select * 
               from {madlib_schema}.logregr('{table_name}','{dep}','{indep}',{numIter}, '{optimizer}', {precision}) 
            '''
     stmt = query_template.format(
         dep=dep,
         indep=self.model['indep'],
         table_name=table_name,
         numIter=numIter,
         optimizer=optimizer,
         precision=precision,
         madlib_schema=self.dbconn.madlib_schema
     )

     logging.info('statement :{0}'.format(stmt))
     fitted = psql.read_frame(stmt, self.dbconn.getConnection())

     # Flatten the result into the model dict: one entry per result column.
     for column in fitted.columns:
         self.model[column] = fitted.get(column)[0]

     return self.model, fitted
Ejemplo n.º 4
0
        def predict(self, predict_table_name,actual_label_col='',threshold=0.5):
            ''' 
              Score a table with the trained model (self.model).

              Input:
              ======
              predict_table_name : (String) the name of the table to be used for prediction
              actual_label_col : (String) the name of the actual label column. Only used when the
                                 model was trained on a list of columns, in which case it is
                                 passed on to the column transformation helpers.
              threshold : (float), the probability beyond which the predicted values will be considered +ve (default: 0.5).
                          A falsy value (e.g. None or 0) returns the raw probability instead of a 0/1 label.

              Output:
              =======
              The object returned by psql.read_frame for the scoring query (presumably a
              pandas DataFrame -- confirm against the file's imports). The query selects
              every column of the table plus the predicted value as column 'prediction'.
            '''
            # A model trained on a list of columns needs the same column
            # transformation applied to the table being scored.
            if isinstance(self.model['indep_org'], list):
                predict_table_name, pivoted_cols, label_col, _ = pivotCategoricalColumns(self.dbconn,predict_table_name, self.model['indep_org'], actual_label_col)
                # Pack the transformed columns into a single array column.
                predict_table_name, _ = convertsColsToArray(self.dbconn, predict_table_name, pivoted_cols, label_col)

            if threshold:
                # Classify: 1 when the logistic probability exceeds the threshold, else 0.
                query_template = '''
                          select *,
                                case when (1.0/(1.0 + exp(-1.0*{madlib_schema}.array_dot({indep}, array{coef}::real[])))) > {threshold} 
                                          THEN 1 ELSE 0 
                                end as prediction 
                          from {table_name}
                       '''
            else:
                # No usable threshold: return the probability itself.
                query_template = '''
                          select *,
                                (1.0/(1.0 + exp(-1.0*{madlib_schema}.array_dot({indep}, array{coef}::real[]))))  as prediction 
                         from {table_name}
                       '''

            # str.format ignores keyword arguments the template does not use, so
            # both templates can share one call ({threshold} is only in the first).
            stmt = query_template.format(
                coef=self.model['coef'],
                indep=self.model['indep'],
                table_name=predict_table_name,
                threshold=threshold,
                madlib_schema=self.dbconn.getMADlibSchema()
            )

            logging.info('statement:{0}'.format(stmt))
            return psql.read_frame(stmt,self.dbconn.getConnection())