def add_row(self, column_one_value, row_data):
    """
    Store one row of the table for a known column-one value.

    The row is written at the position that ``column_one_value`` occupies in
    ``self.column_one_values``. Its cells are taken from ``row_data`` in the
    order given by ``self._col2_order``.

    :param column_one_value: value identifying the row; must be present in
        ``self.column_one_values``
    :param row_data: sequence holding exactly one entry per value in
        ``self.column_two_values``
    :raises BIException: if ``column_one_value`` is unknown, or ``row_data``
        does not have as many entries as ``self.column_two_values``
    """
    if column_one_value not in self.column_one_values:
        raise BIException('Unknown value: "%s" for column' % (column_one_value, ))

    expected_length = len(self.column_two_values)
    if len(row_data) != expected_length:
        # The arguments must be applied with `%`; passing them as extra
        # positional arguments leaves the placeholders unformatted.
        raise BIException(
            'Row for: "%s" should have %d values, but has %d values only'
            % (column_one_value, expected_length, len(row_data)))

    index = self.column_one_values.index(column_one_value)
    self.table[index] = [row_data[i] for i in self._col2_order]
def get_value_column_percent(self, column_one_value, column_two_value):
    """
    Look up one cell of ``self.table_percent_by_column``.

    :param column_one_value: value selecting the row; must be present in
        ``self.column_one_values``
    :param column_two_value: value selecting the column; must be present in
        ``self.column_two_values``
    :return: the entry of ``self.table_percent_by_column`` at the positions
        of the two values
    :raises BIException: if either value is unknown
    """
    first_axis = self.column_one_values
    if column_one_value not in first_axis:
        raise BIException('Unknown column one value: %s' % (column_one_value, ))

    second_axis = self.column_two_values
    if column_two_value not in second_axis:
        raise BIException('Unknown column two value: %s' % (column_two_value, ))

    row_position = first_axis.index(column_one_value)
    cell_position = second_axis.index(column_two_value)
    percent_row = self.table_percent_by_column[row_position]
    return percent_row[cell_position]
def correlation(self, column_one, column_two):
    """
    Find correlation between two numeric columns.

    Both names are checked, in order, against the numeric columns reported by
    the dataframe helper before the computation is delegated to
    ``self._corr``.

    :param column_one: name of the first column
    :param column_two: name of the second column
    :return: whatever ``self._corr(column_one, column_two)`` returns
    :raises BIException: for the first of the two columns that is not numeric
    """
    for candidate in (column_one, column_two):
        if candidate not in self._dataframe_helper.get_numeric_columns():
            raise BIException.non_numeric_column(candidate)

    return self._corr(column_one, column_two)
def get_coeff(self, input_column):
    """
    Return the stored coefficient entry for one input column.

    :param input_column: name of an input column; must be present in
        ``self.input_columns``
    :return: the value stored under ``RegressionResult.COEFF`` for
        ``input_column`` inside the ``RegressionResult.COEFFICIENTS`` section
        of ``self.stats``
    :raises BIException: if ``input_column`` is not one of
        ``self.input_columns``
    """
    if input_column in self.input_columns:
        all_coefficients = self.stats.get(RegressionResult.COEFFICIENTS)
        column_entry = all_coefficients.get(input_column)
        return column_entry.get(RegressionResult.COEFF)

    raise BIException('Input column(%s) has no impact on output column(%s)'
                      % (input_column, self.output_column))
def assert_non_negative_parameter(param_type, param_name, param_value, raise_exception=True):
    """
    Check that a parameter has exactly the expected type and is not negative.

    :param param_type: the type ``param_value`` must have (exact match on
        ``type(param_value)``, subclasses are not accepted)
    :param param_name: parameter name, used only when building the exception
    :param param_value: the value to validate
    :param raise_exception: when truthy a failed check raises; otherwise the
        function reports the failure by returning False
    :return: True when both checks pass; False when a check fails and
        ``raise_exception`` is falsy
    :raises BIException: when a check fails and ``raise_exception`` is truthy
    """
    actual_type = type(param_value)

    if actual_type != param_type:
        if not raise_exception:
            return False
        raise BIException.parameter_invalid_type(param_name, param_type, actual_type)

    if param_value < 0:
        if not raise_exception:
            return False
        raise BIException.parameter_has_negative_value(param_name, param_value)

    return True
def __init__(self, data_frame, column1, column2):
    """
    Validate the data frame and the two columns, then store them.

    :param data_frame: data frame the two columns belong to
    :param column1: name of the first column; must exist and be numeric
    :param column2: name of the second column; must exist and be numeric
    :raises BIException: if the data frame is invalid, or if either column is
        missing or not numeric (``column1`` is checked before ``column2``)
    """
    helper = DataFrameHelper(data_frame)

    if not helper.is_valid_data_frame():
        raise BIException.dataframe_invalid()

    # Same two checks for each column, in the order the columns were given.
    for column_name in (column1, column2):
        if not helper.has_column(column_name):
            raise BIException.column_does_not_exist(column_name)
        if not helper.is_numeric_column(column_name):
            raise BIException.non_numeric_column(column_name)

    self._data_frame = data_frame
    self._column1 = column1
    self._column2 = column2
def __init__(self, data_frame, independent_var, dependent_var, independent_var_levels=None):
    """
    Validate the inputs and settle on the two levels to compare.

    :param data_frame: data frame to use for tests
    :param independent_var: a string type column with at least two levels
    :param dependent_var: a measure type column
    :param independent_var_levels: if independent_var has exactly two levels
        this parameter can be omitted, otherwise two levels in independent_var
        need to be supplied as a list or tuple
    :raises BIException: if the data frame is invalid; if either column is
        missing or of the wrong type; if ``independent_var_levels`` does not
        hold exactly two levels or names a level the column does not have; or
        if no usable levels were supplied and the column does not have
        exactly two levels
    """
    dataframe_helper = DataFrameHelper(data_frame)
    # ensure data_frame is valid
    if not dataframe_helper.is_valid_data_frame():
        raise BIException.dataframe_invalid()
    # ensure data_frame contains a column by name independent_var
    if not dataframe_helper.has_column(independent_var):
        raise BIException.column_does_not_exist(independent_var)
    # ensure column, independent_var, is of type string
    if not dataframe_helper.is_string_column(independent_var):
        raise BIException.non_string_column(independent_var)
    # ensure data_frame contains a column by name dependent_var
    if not dataframe_helper.has_column(dependent_var):
        raise BIException.column_does_not_exist(dependent_var)
    # ensure column, dependent_var, is of numeric type
    if not dataframe_helper.is_numeric_column(dependent_var):
        raise BIException.non_numeric_column(dependent_var)

    self._data_frame = data_frame
    self._independent_var = independent_var
    self._dependent_var = dependent_var
    # presumably the levels actually present in the column -- verify against
    # _get_independent_var_levels
    self._independent_var_levels = self._get_independent_var_levels()

    # isinstance is False for None, so no separate None check is needed, and
    # list/tuple subclasses are accepted as well.
    if isinstance(independent_var_levels, (list, tuple)):
        if len(independent_var_levels) != 2:
            raise BIException(
                "independent_var_levels should only contain two levels")
        for level in independent_var_levels:
            if level not in self._independent_var_levels:
                raise BIException('Column, %s, does not have level "%s"' %
                                  (self._independent_var, level))
        self._independent_var_levels = independent_var_levels
    else:
        # No usable levels supplied (None, or a value that is neither a list
        # nor a tuple): the column itself must have exactly two levels.
        if len(self._independent_var_levels) != 2:
            raise BIException(
                'Column, %s, should have exactly two levels, but it has %d levels'
                % (self._independent_var, len(self._independent_var_levels)))
def fit(self, output_column, input_columns=None):
    """
    Fit a linear regression of ``output_column`` on ``input_columns``.

    Besides fitting the model this prints timing diagnostics, adds this
    script's weight to the completion status and, unless regression
    elasticity messages are ignored, saves a progress message.

    :param output_column: name of the column to predict; must be one of the
        numeric columns reported by the dataframe helper
    :param input_columns: names of the predictor columns; when omitted every
        numeric column other than ``output_column`` is used
    :return: RegressionResult populated with intercept, coefficients,
        p-values, RMSE and R2
    :raises BIException: if ``output_column`` or any of ``input_columns`` is
        not a numeric column
    """
    print("linear regression fit started")
    if output_column not in self._dataframe_helper.get_numeric_columns():
        raise BIException('Output column: %s is not a measure column' %
                          (output_column, ))

    if input_columns is None:
        input_columns = list(
            set(self._dataframe_helper.get_numeric_columns()) - {output_column})

    # NOTE(review): the nesting of this truncation was reconstructed from a
    # collapsed source; it is assumed to apply to caller-supplied columns as
    # well as to the default list -- confirm.
    nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
    if nColsToUse is not None:
        input_columns = input_columns[:nColsToUse]

    if len(set(input_columns) -
           set(self._dataframe_helper.get_numeric_columns())) != 0:
        raise BIException(
            'At least one of the input columns %r is not a measure column' %
            (input_columns, ))

    all_measures = input_columns + [output_column]
    print(all_measures)
    measureDf = self._data_frame.select(all_measures)
    lr = LR(maxIter=LinearRegression.MAX_ITERATIONS,
            regParam=LinearRegression.REGULARIZATION_PARAM,
            elasticNetParam=1.0,
            labelCol=LinearRegression.LABEL_COLUMN_NAME,
            featuresCol=LinearRegression.FEATURES_COLUMN_NAME)

    st = time.time()
    pipeline = MLUtils.create_pyspark_ml_pipeline(input_columns, [], output_column)
    pipelineModel = pipeline.fit(measureDf)
    training_df = pipelineModel.transform(measureDf)
    training_df = training_df.withColumn("label", training_df[output_column])
    print("time taken to create training_df %s" % (time.time() - st, ))

    st = time.time()
    lr_model = lr.fit(training_df)
    lr_summary = lr_model.evaluate(training_df)
    print("lr model summary %s" % (time.time() - st, ))

    sample_data_dict = dict.fromkeys(input_columns)

    coefficients = [
        float(val) if val is not None else None
        for val in lr_model.coefficients.values
    ]
    try:
        p_values = [
            float(val) if val is not None else None
            for val in lr_model.summary.pValues
        ]
    except Exception:
        # Best effort: when the summary cannot provide p-values, report None
        # for every coefficient. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        p_values = [None] * len(coefficients)

    # De-duplicate while keeping the caller's order. list(set(...)) returned
    # the names in arbitrary order, which no longer lines up with
    # `coefficients`/`p_values`.
    # NOTE(review): assumes RegressionResult pairs columns with coefficients
    # by position -- confirm against RegressionResult.set_params.
    seen_columns = set()
    result_columns = []
    for column in input_columns:
        if column not in seen_columns:
            seen_columns.add(column)
            result_columns.append(column)

    regression_result = RegressionResult(output_column, result_columns)
    regression_result.set_params(
        intercept=float(lr_model.intercept),
        coefficients=coefficients,
        p_values=p_values,
        rmse=float(lr_summary.rootMeanSquaredError),
        r2=float(lr_summary.r2),
        sample_data_dict=sample_data_dict)

    self._completionStatus = self._dataframe_context.get_completion_status()
    self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "regressionTrainingEnd",
        "info",
        self._scriptStages["regressionTrainingEnd"]["summary"],
        self._completionStatus,
        self._completionStatus)
    if self._ignoreRegressionElasticityMessages != True:
        CommonUtils.save_progress_message(
            self._messageURL,
            progressMessage,
            ignore=self._ignoreRegressionElasticityMessages)
        # NOTE(review): assumed to sit inside this `if` (nesting was
        # reconstructed from a collapsed source) -- confirm.
        self._dataframe_context.update_completion_status(
            self._completionStatus)
    return regression_result
def get_regression_result(self, output_column):
    """
    Return the regression result stored for a measure column.

    :param output_column: name of the column; must be present in
        ``self.measures``
    :return: ``self.results.get(output_column)``, which is None when the
        column is a known measure but has no entry in ``self.results``
    :raises BIException: if ``output_column`` is not in ``self.measures``
    """
    if output_column in self.measures:
        return self.results.get(output_column)

    raise BIException('No regression result found for column(%s)' %
                      (output_column, ))
def fit(self, output_column, input_columns=None):
    """
    Fit a linear regression and, per input column, find the string columns
    whose levels change the slope the most.

    After the overall model is fitted, each input column is regressed on its
    own against ``output_column`` once per level of every string column that
    has between 1 and 15 levels. For each input column the two string columns
    with the widest spread (max - min) of per-level coefficients are recorded
    in ``lr_dimension``, with levels and coefficients sorted by absolute
    coefficient.

    :param output_column: name of the column to predict; must be one of the
        numeric columns reported by the dataframe helper
    :param input_columns: names of the predictor columns; when omitted every
        numeric column other than ``output_column`` is used
    :return: RegressionResult populated with intercept, coefficients, RMSE,
        R2 and the per-dimension breakdown, or None when every fitted
        coefficient is zero
    :raises BIException: if ``output_column`` or any of ``input_columns`` is
        not a numeric column
    """
    if output_column not in self._dataframe_helper.get_numeric_columns():
        raise BIException('Output column: %s is not a measure column' % (output_column,))

    if input_columns is None:
        input_columns = list(set(self._dataframe_helper.get_numeric_columns()) - {output_column})

    if len(set(input_columns) - set(self._dataframe_helper.get_numeric_columns())) != 0:
        raise BIException('At least one of the input columns %r is not a measure column' % (input_columns,))

    # TODO: ensure no duplicates are present in input_columns
    regression_result = RegressionResult(output_column, input_columns)

    # The comprehension variable is deliberately not called `col`, so it
    # cannot be confused with the `col` function used for filtering below.
    training_df = self._data_frame.rdd.map(
        lambda row: (float(row[output_column]),
                     DenseVector([float(row[name]) for name in input_columns]))).toDF()

    lr = LR(maxIter=LinearRegression.MAX_ITERATIONS,
            regParam=LinearRegression.REGULARIZATION_PARAM,
            elasticNetParam=1.0,
            labelCol=LinearRegression.LABEL_COLUMN_NAME,
            featuresCol=LinearRegression.FEATURES_COLUMN_NAME)
    lr_model = lr.fit(training_df)
    lr_summary = lr_model.evaluate(training_df)

    # TODO: pass t_values and p_values
    coefficients = [float(i) for i in lr_model.coefficients.values]
    if not any(coeff != 0 for coeff in coefficients):
        return None

    sample_data_dict = {}
    lr_dimension = {}
    # NOTE(review): the loop nesting below was reconstructed from a collapsed
    # source; the per-dimension work is assumed to run once per input column
    # because it reads `c` -- confirm.
    for c in input_columns:
        sample_data_dict[c] = None
        entry = {'dimension': '', 'levels': [], 'coefficients': [],
                 'dimension2': '', 'levels2': [], 'coefficients2': []}
        lr_dimension[c] = entry
        diff = 0    # widest coefficient spread seen so far for this column
        diff2 = 0   # second-widest spread seen so far for this column
        for dim in self._string_columns:
            levels = self._levels[dim]
            if not (len(levels) > 0 and len(levels) < 16):
                continue

            # One single-feature model per level of `dim`.
            level_coefficients = []
            for level in levels:
                sub_df = self._data_frame.select(*[c, output_column]).filter(col(dim) == level)
                train = sub_df.rdd.map(
                    lambda row: (float(row[output_column]),
                                 DenseVector([float(row[c])]))).toDF()
                sub_lr_model = lr.fit(train)
                level_coefficients = level_coefficients + [
                    float(i) for i in sub_lr_model.coefficients.values]

            spread = max(level_coefficients) - min(level_coefficients)
            if spread <= diff and spread <= diff2:
                continue

            # Sort once by (|coefficient|, coefficient, level).
            ranked = sorted(zip([abs(y) for y in level_coefficients],
                                level_coefficients, levels))
            ranked_levels = [x for (_, _, x) in ranked]
            ranked_coefficients = [y for (_, y, _) in ranked]

            if spread > diff:
                # Demote the current best to runner-up BEFORE overwriting it.
                # Assigning diff2 after diff handed it the new best spread,
                # so a later second-best dimension could never qualify.
                diff2 = diff
                diff = spread
                entry['dimension2'] = entry['dimension']
                entry['levels2'] = entry['levels']
                entry['coefficients2'] = entry['coefficients']
                entry['dimension'] = dim
                entry['levels'] = ranked_levels
                entry['coefficients'] = ranked_coefficients
            else:
                diff2 = spread
                entry['dimension2'] = dim
                entry['levels2'] = ranked_levels
                entry['coefficients2'] = ranked_coefficients

    regression_result.set_params(intercept=float(lr_model.intercept),
                                 coefficients=coefficients,
                                 rmse=float(lr_summary.rootMeanSquaredError),
                                 r2=float(lr_summary.r2),
                                 sample_data_dict=sample_data_dict,
                                 lr_dimension=lr_dimension)
    return regression_result