예제 #1
0
 def add_row(self, column_one_value, row_data):
     """Store one row of the table, keyed by its column-one value.

     :param column_one_value: value identifying the row; must be present in
         ``self.column_one_values``
     :param row_data: indexable sequence with exactly one entry per element
         of ``self.column_two_values``; the entries are picked in the order
         given by ``self._col2_order`` before being stored
     :raises BIException: if ``column_one_value`` is unknown or ``row_data``
         does not have the expected number of entries
     """
     if column_one_value not in self.column_one_values:
         raise BIException('Unknown value: "%s" for column' %
                           (column_one_value, ))
     expected_count = len(self.column_two_values)
     if len(row_data) != expected_count:
         # Interpolate the values into the message; handing them over as
         # extra constructor arguments leaves the placeholders unformatted.
         raise BIException(
             'Row for: "%s" should have %d values, but has %d values only' %
             (column_one_value, expected_count, len(row_data)))
     index = self.column_one_values.index(column_one_value)
     self.table[index] = [row_data[i] for i in self._col2_order]
예제 #2
0
    def get_value_column_percent(self, column_one_value, column_two_value):
        """Return the ``table_percent_by_column`` entry for a pair of values.

        :param column_one_value: value selecting the first (outer) index;
            must be present in ``self.column_one_values``
        :param column_two_value: value selecting the second (inner) index;
            must be present in ``self.column_two_values``
        :return: ``self.table_percent_by_column[i][j]`` where ``i`` and ``j``
            are the positions of the two values in their respective lists
        :raises BIException: if either value is unknown
        """
        outer_labels = self.column_one_values
        if column_one_value not in outer_labels:
            raise BIException('Unknown column one value: %s' %
                              (column_one_value, ))

        inner_labels = self.column_two_values
        if column_two_value not in inner_labels:
            raise BIException('Unknown column two value: %s' %
                              (column_two_value, ))

        outer_pos = outer_labels.index(column_one_value)
        inner_pos = inner_labels.index(column_two_value)
        return self.table_percent_by_column[outer_pos][inner_pos]
예제 #3
0
    def correlation(self, column_one, column_two):
        """
        Compute the correlation between two numeric columns.

        :param column_one: name of the first column
        :param column_two: name of the second column
        :return: the result of ``self._corr(column_one, column_two)``
        :raises: the exception built by ``BIException.non_numeric_column``
            when a column is not among the helper's numeric columns;
            ``column_one`` is checked first
        """
        for candidate in (column_one, column_two):
            # The numeric-column list is fetched again for every check.
            if candidate not in self._dataframe_helper.get_numeric_columns():
                raise BIException.non_numeric_column(candidate)

        return self._corr(column_one, column_two)
예제 #4
0
    def get_coeff(self, input_column):
        """Return the coefficient stored for ``input_column``.

        :param input_column: column name; must be present in
            ``self.input_columns``
        :return: the ``RegressionResult.COEFF`` entry found under
            ``self.stats[RegressionResult.COEFFICIENTS][input_column]``
        :raises BIException: if ``input_column`` is not one of the input
            columns of this result
        """
        if input_column not in self.input_columns:
            raise BIException('Input column(%s) has no impact on output column(%s)' \
                              %(input_column, self.output_column))

        # Walk the nested mapping one level at a time.
        per_column_stats = self.stats.get(RegressionResult.COEFFICIENTS)
        column_stats = per_column_stats.get(input_column)
        return column_stats.get(RegressionResult.COEFF)
예제 #5
0
    def assert_non_negative_parameter(param_type,
                                      param_name,
                                      param_value,
                                      raise_exception=True):
        """Check that a parameter has the exact expected type and is not negative.

        The type test is an exact match on ``type(param_value)``, so an
        instance of a subclass of ``param_type`` is rejected.

        :param param_type: the type ``param_value`` must have
        :param param_name: parameter name, used when building the exception
        :param param_value: the value to validate
        :param raise_exception: when truthy a failed check raises; otherwise
            a failed check returns ``False``
        :return: ``True`` when both checks pass, ``False`` when a check fails
            and ``raise_exception`` is falsy
        :raises: the exception built by ``BIException.parameter_invalid_type``
            or ``BIException.parameter_has_negative_value``
        """
        actual_type = type(param_value)
        if actual_type != param_type:
            if not raise_exception:
                return False
            raise BIException.parameter_invalid_type(
                param_name, param_type, actual_type)

        if param_value < 0:
            if not raise_exception:
                return False
            raise BIException.parameter_has_negative_value(
                param_name, param_value)

        return True
예제 #6
0
    def __init__(self, data_frame, column1, column2):
        """Validate the data frame and both columns, then remember them.

        :param data_frame: data frame holding the two columns
        :param column1: name of the first column; must exist and be numeric
        :param column2: name of the second column; must exist and be numeric
        :raises: the exception built by the matching ``BIException`` factory
            (``dataframe_invalid``, ``column_does_not_exist`` or
            ``non_numeric_column``) for the first check that fails
        """
        helper = DataFrameHelper(data_frame)

        if not helper.is_valid_data_frame():
            raise BIException.dataframe_invalid()

        # column1 is fully validated before column2 is looked at.
        for column in (column1, column2):
            if not helper.has_column(column):
                raise BIException.column_does_not_exist(column)
            if not helper.is_numeric_column(column):
                raise BIException.non_numeric_column(column)

        self._data_frame = data_frame
        self._column1 = column1
        self._column2 = column2
예제 #7
0
    def __init__(self,
                 data_frame,
                 independent_var,
                 dependent_var,
                 independent_var_levels=None):
        """
        Validate the inputs and settle on the two levels to compare.

        :param data_frame:  data frame to use for tests
        :param independent_var: a string type column with at least two levels
        :param dependent_var:   a measure type column
        :param independent_var_levels:  if independent_var has exactly two levels this parameter can be omitted,
                    otherwise two levels in independent_var need to be supplied as a list or tuple;
                    any other kind of value is ignored and treated as if omitted
        :raises BIException: if the supplied levels are not exactly two, if a supplied
                    level does not occur in the column, or if no usable levels were
                    supplied and the column does not have exactly two levels; the
                    data frame / column checks raise whatever the corresponding
                    ``BIException`` factory helpers build
        """
        dataframe_helper = DataFrameHelper(data_frame)
        # ensure data_frame is valid
        if not dataframe_helper.is_valid_data_frame():
            raise BIException.dataframe_invalid()

        # ensure data_frame contains a column by name independent_var
        if not dataframe_helper.has_column(independent_var):
            raise BIException.column_does_not_exist(independent_var)
        # ensure column, independent_var, is of type string
        if not dataframe_helper.is_string_column(independent_var):
            raise BIException.non_string_column(independent_var)

        # ensure data_frame contains a column by name dependent_var
        if not dataframe_helper.has_column(dependent_var):
            raise BIException.column_does_not_exist(dependent_var)
        # ensure column, dependent_var, is of numeric type
        if not dataframe_helper.is_numeric_column(dependent_var):
            raise BIException.non_numeric_column(dependent_var)

        self._data_frame = data_frame
        self._independent_var = independent_var
        self._dependent_var = dependent_var
        # Start from the levels found in the column itself; an explicit,
        # valid pair supplied by the caller replaces them below.
        self._independent_var_levels = self._get_independent_var_levels()
        # isinstance is False for None, so no separate None comparison is
        # needed, and list/tuple subclasses are accepted as well.
        if isinstance(independent_var_levels, (list, tuple)):
            if len(independent_var_levels) != 2:
                raise BIException(
                    "independent_var_levels should only contain two levels")
            for level in independent_var_levels:
                if level not in self._independent_var_levels:
                    raise BIException('Column, %s, does not have level "%s"' %
                                      (self._independent_var, level))
            self._independent_var_levels = independent_var_levels
        else:
            if len(self._independent_var_levels) != 2:
                raise BIException(
                    'Column, %s, should have exactly two levels, but it has %d levels'
                    %
                    (self._independent_var, len(self._independent_var_levels)))
예제 #8
0
    def fit(self, output_column, input_columns=None):
        print "linear regression fit started"
        if output_column not in self._dataframe_helper.get_numeric_columns():
            raise BIException('Output column: %s is not a measure column' %
                              (output_column, ))

        if input_columns == None:
            input_columns = list(
                set(self._dataframe_helper.get_numeric_columns()) -
                {output_column})

        nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        if nColsToUse != None:
            input_columns = input_columns[:nColsToUse]
        if len(
                set(input_columns) -
                set(self._dataframe_helper.get_numeric_columns())) != 0:
            raise BIException(
                'At least one of the input columns %r is not a measure column'
                % (input_columns, ))

        all_measures = input_columns + [output_column]
        print all_measures
        measureDf = self._data_frame.select(all_measures)
        lr = LR(maxIter=LinearRegression.MAX_ITERATIONS,
                regParam=LinearRegression.REGULARIZATION_PARAM,
                elasticNetParam=1.0,
                labelCol=LinearRegression.LABEL_COLUMN_NAME,
                featuresCol=LinearRegression.FEATURES_COLUMN_NAME)

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(input_columns, [],
                                                      output_column)
        pipelineModel = pipeline.fit(measureDf)
        training_df = pipelineModel.transform(measureDf)
        training_df = training_df.withColumn("label",
                                             training_df[output_column])
        print "time taken to create training_df", time.time() - st
        # st = time.time()
        # training_df.cache()
        # print "caching in ",time.time()-st
        st = time.time()
        lr_model = lr.fit(training_df)
        lr_summary = lr_model.evaluate(training_df)
        print "lr model summary", time.time() - st
        sample_data_dict = {}
        for input_col in input_columns:
            sample_data_dict[input_col] = None

        coefficients = [
            float(val) if val != None else None
            for val in lr_model.coefficients.values
        ]
        try:
            p_values = [
                float(val) if val != None else None
                for val in lr_model.summary.pValues
            ]
        except:
            p_values = [None] * len(coefficients)
        # print p_values
        # print coefficients
        regression_result = RegressionResult(output_column,
                                             list(set(input_columns)))
        regression_result.set_params(intercept=float(lr_model.intercept),\
                                     coefficients=coefficients,\
                                     p_values = p_values,\
                                     rmse=float(lr_summary.rootMeanSquaredError), \
                                     r2=float(lr_summary.r2),\
                                     sample_data_dict=sample_data_dict)

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["script"]
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionTrainingEnd",\
                                    "info",\
                                    self._scriptStages["regressionTrainingEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        if self._ignoreRegressionElasticityMessages != True:
            CommonUtils.save_progress_message(
                self._messageURL,
                progressMessage,
                ignore=self._ignoreRegressionElasticityMessages)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

        return regression_result
예제 #9
0
    def get_regression_result(self, output_column):
        """Return the regression result stored for ``output_column``.

        :param output_column: column name; must be present in
            ``self.measures``
        :return: ``self.results.get(output_column)``, which is ``None`` when
            the column is a known measure but has no stored result
        :raises BIException: if ``output_column`` is not a known measure
        """
        if output_column in self.measures:
            return self.results.get(output_column)

        raise BIException('No regression result found for column(%s)' %
                          (output_column, ))
예제 #10
0
    def fit(self, output_column, input_columns=None):
        """
        Fit a linear regression and, for every input column, find the two
        string dimensions whose levels give the most different coefficients.

        For each input column a single-feature regression is fitted per level
        of every string dimension that has between 1 and 15 levels. The
        dimension with the largest spread (max - min) of those per-level
        coefficients is stored under ``dimension``/``levels``/``coefficients``
        and the runner-up under ``dimension2``/``levels2``/``coefficients2``;
        levels are ordered by ascending absolute coefficient.

        :param output_column: name of the column to predict; must be one of
            the helper's numeric columns
        :param input_columns: names of the predictor columns; when omitted,
            every numeric column except ``output_column`` is used
        :return: a populated ``RegressionResult``, or ``None`` when every
            coefficient of the overall model is zero
        :raises BIException: if ``output_column`` or any input column is not
            a numeric column
        """
        if output_column not in self._dataframe_helper.get_numeric_columns():
            raise BIException('Output column: %s is not a measure column' % (output_column,))

        if input_columns is None:
            input_columns = list(set(self._dataframe_helper.get_numeric_columns()) - {output_column})

        if len(set(input_columns) - set(self._dataframe_helper.get_numeric_columns())) != 0:
            raise BIException('At least one of the input columns %r is not a measure column' % (input_columns,))

        # TODO: ensure no duplicates are present in input_columns

        regression_result = RegressionResult(output_column, input_columns)

        # Rows become (label, features) pairs before conversion to a data frame.
        training_df = self._data_frame.rdd.map(lambda row: \
                                                   (float(row[output_column]),
                                                    DenseVector([float(row[name]) for name in input_columns]))).toDF()

        lr = LR(maxIter=LinearRegression.MAX_ITERATIONS, regParam=LinearRegression.REGULARIZATION_PARAM,
                elasticNetParam=1.0, labelCol=LinearRegression.LABEL_COLUMN_NAME,
                featuresCol=LinearRegression.FEATURES_COLUMN_NAME)

        lr_model = lr.fit(training_df)
        lr_summary = lr_model.evaluate(training_df)

        # TODO: pass t_values and p_values
        coefficients = [float(i) for i in lr_model.coefficients.values]
        if not any(coeff != 0 for coeff in coefficients):
            return None
        sample_data_dict = {}
        lr_dimension = {}
        for c in input_columns:
            sample_data_dict[c] = None
            lr_dimension[c] = {'dimension':'', 'levels': [], 'coefficients':[],
                                'dimension2':'', 'levels2': [], 'coefficients2':[]}
            # Largest and second-largest coefficient spread seen so far.
            diff = 0
            diff2 = 0
            for dim in self._string_columns:
                levels = self._levels[dim]
                # Only dimensions with 1..15 levels are analysed.
                if not (len(levels) > 0 and len(levels) < 16):
                    continue

                temp = []
                for level in levels:
                    sub_df = self._data_frame.select(*[c,output_column]).filter(col(dim)==level)
                    train = sub_df.rdd.map(lambda row: (float(row[output_column]),
                                                                DenseVector([float(row[c])]))).toDF()
                    sub_lr_model = lr.fit(train)
                    temp = temp + [float(i) for i in sub_lr_model.coefficients.values]

                spread = max(temp) - min(temp)
                if spread > diff:
                    # New best: demote the previous best to second place
                    # BEFORE overwriting it, so diff2 keeps the old spread
                    # and later dimensions can still compete for second.
                    diff2 = diff
                    diff = spread
                    lr_dimension[c]['dimension2']= lr_dimension[c]['dimension']
                    lr_dimension[c]['levels2'] = lr_dimension[c]['levels']
                    lr_dimension[c]['coefficients2'] = lr_dimension[c]['coefficients']
                    ranked = sorted(zip([abs(y) for y in temp], temp, levels))
                    lr_dimension[c]['dimension'] = dim
                    lr_dimension[c]['levels'] = [x for (z,y,x) in ranked]
                    lr_dimension[c]['coefficients'] = [y for (z,y,x) in ranked]
                elif spread > diff2:
                    diff2 = spread
                    ranked = sorted(zip([abs(y) for y in temp], temp, levels))
                    lr_dimension[c]['dimension2'] = dim
                    lr_dimension[c]['levels2'] = [x for (z,y,x) in ranked]
                    lr_dimension[c]['coefficients2'] = [y for (z,y,x) in ranked]

        regression_result.set_params(intercept=float(lr_model.intercept), coefficients=coefficients,
                                      rmse=float(lr_summary.rootMeanSquaredError), r2=float(lr_summary.r2),
                                      sample_data_dict=sample_data_dict, lr_dimension=lr_dimension)

        return regression_result