Example #1
    def test(self):
        """
        Perform an independent-samples t-test.
        :return: IndependentSampleTTestResult
        """
        indep_col = FN.col(self._independent_var)
        dep_col = FN.col(self._dependent_var)
        sample1 = self._data_frame.filter(
            indep_col == self._independent_var_levels[0]).select(dep_col)
        sample2 = self._data_frame.filter(
            indep_col == self._independent_var_levels[1]).select(dep_col)

        sample1_size = sample1.count()
        sample2_size = sample2.count()

        sample1_variance = Stats.variance(sample1, self._dependent_var)
        sample2_variance = Stats.variance(sample2, self._dependent_var)

        # note: exact float equality of two sample variances is rare, so the
        # Welch branch below is the usual path in practice
        if sample1_variance == sample2_variance:
            if sample1_size == sample2_size:
                return self._ttest_equal_size_samples_with_same_variance(
                    sample1_size, sample1, sample2, sample1_variance,
                    sample2_variance)
            else:
                return self._ttest_unequal_size_samples_with_same_variance(
                    sample1, sample2, sample1_variance, sample2_variance)

        return self._ttest_with_different_sample_variances(
            sample1, sample2, sample1_variance, sample2_variance)
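
A quick external sanity check for the three branches below: scipy.stats.ttest_ind covers both the pooled-variance and Welch cases. A minimal sketch, assuming the two Spark columns have been collected into plain Python lists (the values are made up):

import scipy.stats as st

sample1_vals = [12.1, 9.8, 11.4, 10.9]   # hypothetical collected values
sample2_vals = [8.7, 9.9, 10.2, 9.1]

# pooled-variance (Student) t-test, mirrors the equal-variance branches
t_pooled, p_pooled = st.ttest_ind(sample1_vals, sample2_vals, equal_var=True)
# Welch's t-test, mirrors _ttest_with_different_sample_variances
t_welch, p_welch = st.ttest_ind(sample1_vals, sample2_vals, equal_var=False)
print(t_pooled, p_pooled, t_welch, p_welch)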
Example #2
    def _ttest_unequal_size_samples_with_same_variance(self, sample1, sample2,
                                                       sample1_variance,
                                                       sample2_variance):
        sample1_size = sample1.count()
        sample2_size = sample2.count()
        sample1_mean = Stats.mean(sample1, self._dependent_var)
        sample2_mean = Stats.mean(sample2, self._dependent_var)
        degrees_of_freedom = sample1_size + sample2_size - 2
        pooled_std_dev = math.sqrt(
            ((sample1_size - 1) * sample1_variance +
             (sample2_size - 1) * sample2_variance) / degrees_of_freedom)
        # 1.0 keeps the divisions float under Python 2 (this codebase uses
        # old_div elsewhere); 1 / n would truncate to zero there
        std_err = pooled_std_dev * math.sqrt((1.0 / sample1_size) +
                                             (1.0 / sample2_size))
        t_value = (sample1_mean - sample2_mean) / std_err
        p_value = Stats.t_distribution_critical_value(t_value,
                                                      df=degrees_of_freedom)

        return IndependentSampleTTestResult(
            indep_variable=self._independent_var,
            dep_variable=self._dependent_var,
            sample1_level=self._independent_var_levels[0],
            sample1_mean=sample1_mean,
            sample1_variance=sample1_variance,
            sample2_level=self._independent_var_levels[1],
            sample2_mean=sample2_mean,
            sample2_variance=sample2_variance,
            t_value=t_value,
            p_value=p_value,
            df=degrees_of_freedom)
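
For reference, the same pooled-variance arithmetic written out step by step on two tiny hardcoded samples (values invented for illustration; the two-sided p-value uses scipy.stats.t.sf, which may differ from whatever Stats.t_distribution_critical_value does internally):

import math
from scipy import stats

a = [5.0, 6.0, 7.0, 8.0, 9.0]   # n1 = 5, mean 7.0, variance 2.5
b = [4.0, 6.0, 8.0]             # n2 = 3, mean 6.0, variance 4.0
n1, n2 = len(a), len(b)
mean1, mean2 = sum(a) / n1, sum(b) / n2
var1 = sum((v - mean1) ** 2 for v in a) / (n1 - 1)
var2 = sum((v - mean2) ** 2 for v in b) / (n2 - 1)
df = n1 + n2 - 2
pooled_sd = math.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / df)
std_err = pooled_sd * math.sqrt(1.0 / n1 + 1.0 / n2)
t = (mean1 - mean2) / std_err
p = 2 * stats.t.sf(abs(t), df)   # two-sided p-value
print(t, p)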
Example #3
    def _ttest_with_different_sample_variances(self, sample1, sample2,
                                               sample1_variance,
                                               sample2_variance):
        # Welch's t-test
        sample1_size = sample1.count()
        sample2_size = sample2.count()
        sample1_mean = Stats.mean(sample1, self._dependent_var)
        sample2_mean = Stats.mean(sample2, self._dependent_var)
        # Welch-Satterthwaite approximation for the degrees of freedom
        se1_sq = sample1_variance / sample1_size
        se2_sq = sample2_variance / sample2_size
        degrees_of_freedom = math.pow(se1_sq + se2_sq, 2) / (
            math.pow(se1_sq, 2) / (sample1_size - 1) +
            math.pow(se2_sq, 2) / (sample2_size - 1))
        t_value = (sample1_mean - sample2_mean) / math.sqrt(se1_sq + se2_sq)
        p_value = Stats.t_distribution_critical_value(t_value,
                                                      df=degrees_of_freedom)

        return IndependentSampleTTestResult(
            indep_variable=self._independent_var,
            dep_variable=self._dependent_var,
            sample1_level=self._independent_var_levels[0],
            sample1_mean=sample1_mean,
            sample1_variance=sample1_variance,
            sample2_level=self._independent_var_levels[1],
            sample2_mean=sample2_mean,
            sample2_variance=sample2_variance,
            t_value=t_value,
            p_value=p_value,
            df=degrees_of_freedom)
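
The degrees-of-freedom expression is the Welch-Satterthwaite approximation. A sketch cross-checking the hand-rolled t statistic against scipy (sample arrays are made up):

import math
import numpy as np
from scipy import stats

a = [10.2, 11.5, 9.8, 12.0, 10.7]
b = [8.1, 9.4, 7.7]
n1, n2 = len(a), len(b)
v1, v2 = np.var(a, ddof=1), np.var(b, ddof=1)   # sample variances
# Welch-Satterthwaite degrees of freedom, same expression as above
df = (v1 / n1 + v2 / n2) ** 2 / (
    v1 ** 2 / (n1 ** 2 * (n1 - 1)) + v2 ** 2 / (n2 ** 2 * (n2 - 1)))
t_manual = (np.mean(a) - np.mean(b)) / math.sqrt(v1 / n1 + v2 / n2)
t_scipy, p_scipy = stats.ttest_ind(a, b, equal_var=False)
print(t_manual, t_scipy, df)   # t_manual matches t_scipy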
Example #4
    def _ttest_equal_size_samples_with_same_variance(self, sample_size,
                                                     sample1, sample2,
                                                     sample1_variance,
                                                     sample2_variance):
        sample1_mean = Stats.mean(sample1, self._dependent_var)
        sample2_mean = Stats.mean(sample2, self._dependent_var)
        pooled_standard_deviation = math.sqrt(
            (sample1_variance + sample2_variance) / 2)
        standard_error = pooled_standard_deviation * math.sqrt(
            2.0 / sample_size)
        t_value = (sample1_mean - sample2_mean) / standard_error
        degrees_of_freedom = 2 * sample_size - 2
        p_value = Stats.t_distribution_critical_value(t_value,
                                                      df=degrees_of_freedom)

        return IndependentSampleTTestResult(
            indep_variable=self._independent_var,
            dep_variable=self._dependent_var,
            sample1_level=self._independent_var_levels[0],
            sample1_mean=sample1_mean,
            sample1_variance=sample1_variance,
            sample2_level=self._independent_var_levels[1],
            sample2_mean=sample2_mean,
            sample2_variance=sample2_variance,
            t_value=t_value,
            p_value=p_value,
            df=degrees_of_freedom)
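
When the two samples have the same size n, this shortcut is exactly the general pooled formula of Example #2; a short numerical check with arbitrary variances:

import math

n, v1, v2 = 8, 2.5, 4.0   # equal sizes, arbitrary sample variances
general = (math.sqrt(((n - 1) * v1 + (n - 1) * v2) / (2 * n - 2)) *
           math.sqrt(1.0 / n + 1.0 / n))
shortcut = math.sqrt((v1 + v2) / 2) * math.sqrt(2.0 / n)
assert abs(general - shortcut) < 1e-12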
Example #5
    def feat_importance_linear(self):
        linear_list = []
        if self._pandas_flag:
            le = LabelEncoder()
            X_train = self.data_frame.drop(self.target, axis=1)
            if self.problem_type != 'REGRESSION':
                try:
                    Y_train = le.fit_transform(self.data_frame[self.target])
                except Exception:
                    Y_train = le.fit_transform(
                        self.data_frame[self.target].astype(str))
            else:
                Y_train = self.data_frame[self.target]
            X_train = X_train[X_train._get_numeric_data().columns]

            for c in list(X_train.columns):
                pearson_coef, p_value = stats.pearsonr(X_train[c], Y_train)
                if p_value < 0.05:
                    linear_list.append(c)
        else:
            if self.problem_type != 'REGRESSION':
                indexer = StringIndexer(inputCol=self.target, outputCol="label")
                indexed = indexer.fit(self.data_frame).transform(self.data_frame)
                X_train = indexed.drop('label')
                num_var = [i[0] for i in X_train.dtypes
                           if i[1] in ('int', 'double')]
                # every numeric column has the same row count
                num_of_samples = indexed.select(num_var[0]).count()
                for column_one in num_var:
                    corr = indexed.corr(column_one, 'label')
                    df = num_of_samples - 2
                    std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                    t_value = old_div(corr, std_error)
                    p_value = Stats.t_distribution_critical_value(t_value,
                                                                  df=df)
                    if p_value < 0.05:
                        linear_list.append(column_one)
            else:
                X_train = self.data_frame.drop(self.target)
                num_var = [i[0] for i in X_train.dtypes
                           if i[1] in ('int', 'double')]
                for column_one in num_var:
                    corr = self.data_frame.corr(column_one, self.target)
                    num_of_samples = self.data_frame.select(column_one).count()
                    df = num_of_samples - 2
                    std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                    t_value = old_div(corr, std_error)
                    p_value = Stats.t_distribution_critical_value(t_value,
                                                                  df=df)
                    if p_value < 0.05:
                        linear_list.append(column_one)
        self.data_change_dict['SelectedColsLinear'] = linear_list
        linear_list.append(self.target)
        return linear_list
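
The pandas branch reduces to "keep numeric columns whose Pearson correlation with the target is significant at p < 0.05". A standalone sketch of that selection rule (the frame and target name are hypothetical):

import pandas as pd
from scipy import stats

df = pd.DataFrame({"x1": [1, 2, 3, 4, 5],
                   "x2": [2, 1, 4, 3, 5],
                   "noise": [7, 7, 7, 7, 8],
                   "y": [1.1, 2.0, 2.9, 4.2, 5.1]})
target = "y"
selected = []
for col in df.drop(columns=[target]).select_dtypes("number").columns:
    pearson_coef, p_value = stats.pearsonr(df[col], df[target])
    if p_value < 0.05:
        selected.append(col)
print(selected)   # numeric columns significantly correlated with the target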
Example #6
    def _corr(self, column_one, column_two):
        """
        Finds correlation between two columns, also calculates
            a) statistical significance info
            b) effect size info - coefficient of determination, and
            c) confidence intervals.

        :param column_one: name of the first column
        :param column_two: name of the second column
        :return: CorrelationStats with significance and confidence intervals
        """
        corr = self._data_frame.corr(column_one, column_two)
        num_of_samples = self._data_frame.select(column_one).count()
        df = num_of_samples - 2
        std_error = math.sqrt((1 - math.pow(corr, 2)) / df)
        t_value = corr / std_error
        p_value = Stats.t_distribution_critical_value(t_value, df=df)
        coeff_determination = math.pow(corr, 2)

        corr_stats = CorrelationStats(correlation=corr, std_error=std_error, t_value=t_value, p_value=p_value,
                                      degrees_of_freedom=df, coeff_determination=coeff_determination)
        for alpha in ALPHA_LEVELS:
            (lower_bound, upper_bound) = self._confidence_interval(corr, num_of_samples, alpha)
            corr_stats.set_confidence_interval(alpha, lower_bound, upper_bound)

        return corr_stats
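
The significance test here is the standard t-test for a correlation coefficient, t = r * sqrt(df) / sqrt(1 - r^2) with df = n - 2 (note it breaks down when r is exactly +/-1, since std_error becomes zero). Its two-sided p-value agrees with scipy.stats.pearsonr:

import math
from scipy import stats

x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
y = [1.2, 1.9, 3.4, 3.9, 5.2, 5.8]
r, p_scipy = stats.pearsonr(x, y)
df = len(x) - 2
t = r * math.sqrt(df) / math.sqrt(1 - r ** 2)
p_manual = 2 * stats.t.sf(abs(t), df)   # two-sided
print(p_manual, p_scipy)                # should agree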
Example #7
    def test(self):
        column1 = FN.col(self._column1)
        column2 = FN.col(self._column2)
        diff_column_name = 'diff'
        diff_expr = (column2 - column1).alias(diff_column_name)
        sample_of_differences = self._data_frame.select(diff_expr)
        sample_size = sample_of_differences.count()
        sample_mean = Stats.mean(sample_of_differences, diff_column_name)
        sample_sd = Stats.standard_deviation(sample_of_differences, diff_column_name)
        t_value = float(sample_mean) / (old_div(sample_sd, math.sqrt(sample_size)))
        degree_of_freedom = sample_size - 1
        p_value = Stats.t_distribution_critical_value(t_value, df=degree_of_freedom)

        return DependentSampleTtestResult(column1=self._column1, column2=self._column2, sample_size=sample_size,
                                          mean_of_differences=sample_mean, df=degree_of_freedom, t_value=t_value,
                                          p_value=p_value)
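
This is the dependent (paired) samples t-test on per-row differences; scipy.stats.ttest_rel computes the same statistic. A sketch on made-up paired data, with the difference taken in the same column2 - column1 direction as above:

from scipy import stats

column1 = [12.0, 11.4, 13.2, 10.8, 12.5]   # made-up paired measurements
column2 = [12.9, 11.9, 13.1, 11.6, 13.4]
t, p = stats.ttest_rel(column2, column1)   # tests mean(column2 - column1)
print(t, p)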
Example #8
    def get_measure_column_splits(self, df, colname, n_split=5):
        """
        n_split = number of bins, so splits has n_split + 1 boundaries, e.g.
        splits = [0.0, 23.0, 46.0, 69.0, 92.0, 115.0]
        splits_range = [(0.0, 23.0), (23.0, 46.0), (46.0, 69.0), (69.0, 92.0), (92.0, 115.0)]
        """
        minimum_val = Stats.min(df, colname)
        maximum_val = Stats.max(df, colname)
        splits = CommonUtils.frange(minimum_val,
                                    maximum_val,
                                    num_steps=n_split)
        splits = sorted(splits)
        splits_range = [(splits[idx], splits[idx + 1])
                        for idx in range(len(splits) - 1)]
        output = {"splits": splits, "splits_range": splits_range}
        return output
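
Assuming CommonUtils.frange behaves like an inclusive linspace (the docstring's example values suggest n_split + 1 evenly spaced boundaries), the same split logic in plain numpy:

import numpy as np

minimum_val, maximum_val, n_split = 0.0, 115.0, 5
splits = list(np.linspace(minimum_val, maximum_val, n_split + 1))
splits_range = [(splits[i], splits[i + 1]) for i in range(len(splits) - 1)]
print(splits)        # [0.0, 23.0, 46.0, 69.0, 92.0, 115.0]
print(splits_range)  # [(0.0, 23.0), ..., (92.0, 115.0)]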
Example #9
    def generateClusterDataDict(self, measure_column, kmeans_result):
        kmeans_stats = kmeans_result["stats"]
        input_columns = kmeans_stats["inputCols"]
        kmeans_df = kmeans_result["data"]
        cluster_data_dict = {"chart_data": None, "grp_data": None}
        grp_df = kmeans_df.groupBy("prediction").count().toPandas()
        grp_counts = list(zip(grp_df["prediction"], grp_df["count"]))
        grp_counts = sorted(grp_counts, key=lambda x: x[1], reverse=True)
        grp_dict = dict(grp_counts)

        colors = ["red", "blue", "green", "yellow", "black"]
        cluster_ids = list(grp_df["prediction"])
        color_dict = dict(list(zip(cluster_ids, colors[:len(cluster_ids)])))

        chart_data = {"heading": "", "data": []}
        result_col_data = [self._result_column]
        measure_col_data = [measure_column]
        color_data = ["Colors"]
        plot_labels = ["Cluster Labels"]

        grp_data = []
        total = float(sum(grp_dict.values()))
        for grp_id in list(grp_df["prediction"]):
            data = {}
            data["group_number"] = grp_id + 1
            data["count"] = grp_dict[grp_id]
            data["contribution"] = round(
                old_div(grp_dict[grp_id] * 100, total), 2)
            df = kmeans_df.filter(FN.col("prediction") == grp_id)
            data["columns"] = dict(
                list(zip(input_columns, [{}] * len(input_columns))))
            for val in input_columns:
                data["columns"][val]["avg"] = round(Stats.mean(df, val), 2)
            grp_data.append(data)
            # preparing chart data
            grp_result_data = [
                x[0] for x in df.select(self._result_column).collect()
            ]
            result_col_data += grp_result_data
            grp_measure_data = [
                x[0] for x in df.select(measure_column).collect()
            ]
            measure_col_data += grp_measure_data
            color_list = [color_dict[grp_id]] * len(grp_measure_data)
            color_data += color_list
            label_list = ["Cluster " + str(int(grp_id))]
            plot_labels += label_list

        grp_data = sorted(grp_data,
                          key=lambda x: x["contribution"],
                          reverse=True)
        chart_data = [
            measure_col_data, result_col_data, color_data, plot_labels
        ]
        cluster_data_dict["grp_data"] = grp_data
        cluster_data_dict["chart_data"] = chart_data
        return cluster_data_dict
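
The dict comprehension used for data["columns"] above sidesteps a classic pitfall: dict(zip(cols, [{}] * len(cols))) binds every key to the same dict object, so each per-column write clobbers all columns:

cols = ["a", "b"]
shared = dict(zip(cols, [{}] * len(cols)))
shared["a"]["avg"] = 1.0
print(shared["b"])            # {'avg': 1.0} -- both keys share one dict

independent = {c: {} for c in cols}
independent["a"]["avg"] = 1.0
print(independent["b"])       # {} -- separate dicts, as intended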
Example #10
    def _confidence_interval(self, correlation, num_samples, alpha):
        """
        Finds confidence interval for correlation at given alpha level

        Ref: http://www2.sas.com/proceedings/sugi31/170-31.pdf

        :param correlation: sample correlation coefficient
        :param num_samples: number of observations
        :param alpha: significance level
        :return: tuple (lower_bound, upper_bound)
        """
        normalized_correlation = 0.5 * math.log(float(1 + correlation) / (1 - correlation))
        std_dev = math.sqrt(1.0 / (num_samples - 3))
        normalized_lowerbound = normalized_correlation - Stats.normal_distribution_percentile_point_function(
            alpha) * std_dev
        normalized_upperbound = normalized_correlation + Stats.normal_distribution_percentile_point_function(
            alpha) * std_dev
        return (math.tanh(normalized_lowerbound), math.tanh(normalized_upperbound))
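
This is the Fisher z-transformation interval: z = atanh(r), standard deviation 1/sqrt(n - 3), back-transformed with tanh. A sketch using scipy's normal quantile; whether Stats.normal_distribution_percentile_point_function expects alpha or 1 - alpha/2 is not visible here, so the usual two-sided convention is assumed:

import math
from scipy import stats

r, n, alpha = 0.6, 50, 0.05
z = math.atanh(r)                       # 0.5 * log((1 + r) / (1 - r))
sd = math.sqrt(1.0 / (n - 3))
z_crit = stats.norm.ppf(1 - alpha / 2)  # two-sided critical value
lower, upper = math.tanh(z - z_crit * sd), math.tanh(z + z_crit * sd)
print(lower, upper)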
Example #11
    def remove_outliers(self, df, outlier_removal_col):
        '''Need to check how it will affect multiple columns'''
        outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
            df, outlier_removal_col)
        # chain both filters on df; filtering self._data_frame twice, as the
        # original did, silently discarded the lower-bound filter
        df = df.filter(df[outlier_removal_col] > ol_lower_range)
        df = df.filter(df[outlier_removal_col] < ol_upper_range)
        return df
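
Stats.detect_outliers_z presumably derives z-score-based bounds; a common convention is mean plus or minus 3 standard deviations. A pandas sketch of the same remove-by-bounds step under that assumption:

import pandas as pd

s = pd.Series([10.0] * 20 + [200.0])
lower = s.mean() - 3 * s.std()   # assumed z-score rule; the real bounds
upper = s.mean() + 3 * s.std()   # come from Stats.detect_outliers_z
cleaned = s[(s > lower) & (s < upper)]   # both filters chained
print(len(s), len(cleaned))              # 21, 20 -- the 200.0 is dropped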
Example #12
    def run_regression(self, df, measure_column):
        output = {}
        result_column = self._result_column
        result = LinearRegression(df, self._dataframe_helper,
                                  self._dataframe_context, self._metaParser,
                                  self._spark).fit(result_column)
        result = {
            "intercept": result.get_intercept(),
            "rmse": result.get_root_mean_square_error(),
            "rsquare": result.get_rsquare(),
            "coeff": result.get_all_coeff()
        }
        if measure_column in result["coeff"].keys():
            output["coeff"] = result["coeff"][measure_column]["coefficient"]
            try:
                output["elasticity_value"] = output["coeff"] * Stats.mean(
                    df, result_column) / Stats.mean(df, measure_column)
            except Exception:
                # e.g. ZeroDivisionError when the measure mean is zero
                output["elasticity_value"] = 0
        else:
            output["coeff"] = 0
            output["elasticity_value"] = 0
        return output
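
The elasticity value is just the fitted coefficient rescaled by the ratio of the two column means. A standalone sketch with sklearn standing in for the project's LinearRegression wrapper (data invented):

import numpy as np
from sklearn.linear_model import LinearRegression

x = np.array([[1.0], [2.0], [3.0], [4.0]])   # measure column (hypothetical)
y = np.array([2.1, 3.9, 6.2, 7.8])           # result column (hypothetical)
coeff = LinearRegression().fit(x, y).coef_[0]
elasticity = coeff * y.mean() / x.mean()     # same rescaling as above
print(coeff, elasticity)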
Example #13
    def cap_outliers(self, outlier_replacement_col):
        outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
            self._data_frame, outlier_replacement_col)
        df_dup = self._data_frame
        # floor values below the lower bound
        self._data_frame = df_dup.withColumn(
            outlier_replacement_col,
            when((df_dup[outlier_replacement_col] < ol_lower_range),
                 ol_lower_range).otherwise(df_dup[outlier_replacement_col]))
        # cap values above the upper bound
        self._data_frame = self._data_frame.withColumn(
            outlier_replacement_col,
            when((self._data_frame[outlier_replacement_col] > ol_upper_range),
                 ol_upper_range).otherwise(
                     self._data_frame[outlier_replacement_col]))
        return self._data_frame
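
In pandas, the same capping (winsorizing to the detected bounds) is a one-liner with clip; the bounds here are stand-ins for whatever Stats.detect_outliers_z returns:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 50.0, -40.0])
ol_lower_range, ol_upper_range = -10.0, 10.0   # hypothetical detected bounds
capped = s.clip(lower=ol_lower_range, upper=ol_upper_range)
print(capped.tolist())   # -40.0 -> -10.0, 50.0 -> 10.0, rest unchanged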
Example #14
    def mode_impute_outliers(self, outlier_imputation_col):
        outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
            self._data_frame, outlier_imputation_col)
        df_without_outliers = self.remove_outliers(self._data_frame,
                                                   outlier_imputation_col)
        mode_without_outliers = self.get_mode(
            self._data_frame, df_without_outliers[outlier_imputation_col])
        self._data_frame = self._data_frame.withColumn(
            outlier_imputation_col,
            when((self._data_frame[outlier_imputation_col] < ol_lower_range) |
                 (self._data_frame[outlier_imputation_col] > ol_upper_range),
                 mode_without_outliers).otherwise(
                     self._data_frame[outlier_imputation_col]))
        return self._data_frame
Example #15
    def mean_impute_outliers(self, outlier_imputation_col):
        outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
            self._data_frame, outlier_imputation_col)
        df_without_outliers = self.remove_outliers(self._data_frame,
                                                   outlier_imputation_col)
        mean_without_outliers = df_without_outliers.agg(
            avg(outlier_imputation_col)).first()[0]
        self._data_frame = self._data_frame.withColumn(
            outlier_imputation_col,
            when((self._data_frame[outlier_imputation_col] < ol_lower_range) |
                 (self._data_frame[outlier_imputation_col] > ol_upper_range),
                 mean_without_outliers).otherwise(
                     self._data_frame[outlier_imputation_col]))
        return self._data_frame
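
Examples #14 and #15 follow the same pattern: compute a replacement statistic on the outlier-free frame, then conditionally overwrite out-of-range values. The pandas analogue uses a boolean mask (bounds hypothetical):

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 50.0])
lower, upper = 0.0, 10.0                       # hypothetical detected bounds
inliers = s[(s > lower) & (s < upper)]
replacement = inliers.mean()                   # or inliers.mode().iloc[0]
imputed = s.mask((s < lower) | (s > upper), replacement)
print(imputed.tolist())                        # [1.0, 2.0, 3.0, 2.0]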
Example #16
    def generate_card4_data(self, col1, col2):
        # col1 is the result column, col2 is the measure column
        fs = time.time()
        data_dict = {}
        significant_dimensions = \
            self._dataframe_helper.get_significant_dimension()
        print()
        print("-" * 100)
        print("Target Column : ", col1)
        print("Measure Column : ", col2)
        print("significant_dimensions : ", significant_dimensions)
        if significant_dimensions != {}:
            sig_dims = [(x, significant_dimensions[x])
                        for x in list(significant_dimensions.keys())]
            sig_dims = sorted(sig_dims, key=lambda x: x[1], reverse=True)
            cat_columns = [x[0] for x in sig_dims[:10]]
        else:
            cat_columns = self._dataframe_helper.get_string_columns()[:10]

        if not self._pandas_flag:
            col1_mean = Stats.mean(self._data_frame, col1)
            col2_mean = Stats.mean(self._data_frame, col2)
        else:
            col1_mean = self._data_frame[col1].mean()
            col2_mean = self._data_frame[col2].mean()
        print("col1=>", col1, " | col2=>", col2)
        print(col1_mean, col2_mean)
        if not self._pandas_flag:
            low1low2 = self._data_frame.filter(
                FN.col(col1) < col1_mean).filter(FN.col(col2) < col2_mean)
            low1high2 = self._data_frame.filter(
                FN.col(col1) < col1_mean).filter(FN.col(col2) >= col2_mean)
            high1high2 = self._data_frame.filter(
                FN.col(col1) >= col1_mean).filter(FN.col(col2) >= col2_mean)
            high1low2 = self._data_frame.filter(
                FN.col(col1) >= col1_mean).filter(FN.col(col2) < col2_mean)
            low1low2Count = low1low2.count()
            low1high2Count = low1high2.count()
            high1high2Count = high1high2.count()
            high1low2Count = high1low2.count()
        else:
            low1low2 = self._data_frame[(self._data_frame[col1] < col1_mean)
                                        & (self._data_frame[col2] < col2_mean)]
            low1high2 = self._data_frame[(self._data_frame[col1] < col1_mean) &
                                         (self._data_frame[col2] >= col2_mean)]
            high1high2 = self._data_frame[
                (self._data_frame[col1] >= col1_mean)
                & (self._data_frame[col2] >= col2_mean)]
            high1low2 = self._data_frame[(self._data_frame[col1] >= col1_mean)
                                         &
                                         (self._data_frame[col2] < col2_mean)]
            low1low2Count = low1low2.shape[0]
            low1high2Count = low1high2.shape[0]
            high1high2Count = high1high2.shape[0]
            high1low2Count = high1low2.shape[0]
        contribution = {}
        freq = {}
        elasticity_dict = {}
        print("low1low2:", low1low2Count)
        print("low1high2:", low1high2Count)
        print("high1high2:", high1high2Count)
        print("high1low2:", high1low2Count)
        print("quadrant dataframe creation Done in ",
              time.time() - fs, " seconds.")

        dfs = []
        labels = []
        if low1low2Count > 0:
            fs = time.time()
            freq["low1low2"] = self.get_freq_dict(low1low2, cat_columns)[:3]
            print("get_freq_dict Analysis Done in ",
                  time.time() - fs, " seconds.")
            if not self._pandas_flag:
                contribution["low1low2"] = str(
                    round(
                        old_div(low1low2Count * 100,
                                self._data_frame.count()))) + "%"
            else:
                contribution["low1low2"] = str(
                    round(
                        low1low2Count * 100 / self._data_frame.shape[0])) + "%"
            fs = time.time()
            elasticity_dict["low1low2"] = self.run_regression(low1low2, col2)
            print("run_regression(elasticity) Analysis Done in ",
                  time.time() - fs, " seconds.")
            dfs.append("low1low2")
            labels.append("Low %s with Low %s" % (col1, col2))
        if low1high2Count > 0:
            fs = time.time()
            freq["low1high2"] = self.get_freq_dict(low1high2, cat_columns)[:3]
            print("get_freq_dict Analysis Done in ",
                  time.time() - fs, " seconds.")
            if not self._pandas_flag:
                contribution["low1high2"] = str(
                    round(
                        old_div(low1high2Count * 100,
                                self._data_frame.count()))) + "%"
            else:
                contribution["low1high2"] = str(
                    round(low1high2Count * 100 /
                          self._data_frame.shape[0])) + "%"
            fs = time.time()
            elasticity_dict["low1high2"] = self.run_regression(low1high2, col2)
            print("run_regression(elasticity) Analysis Done in ",
                  time.time() - fs, " seconds.")
            dfs.append("low1high2")
            labels.append("Low %s with High %s" % (col1, col2))
        if high1high2Count > 0:
            fs = time.time()
            freq["high1high2"] = self.get_freq_dict(high1high2,
                                                    cat_columns)[:3]
            print("get_freq_dict Analysis Done in ",
                  time.time() - fs, " seconds.")
            if not self._pandas_flag:
                contribution["high1high2"] = str(
                    round(
                        old_div(high1high2Count * 100,
                                self._data_frame.count()))) + "%"
            else:
                contribution["high1high2"] = str(
                    round(high1high2Count * 100 /
                          self._data_frame.shape[0])) + "%"
            fs = time.time()
            elasticity_dict["high1high2"] = self.run_regression(
                high1high2, col2)
            print("run_regression(elasticity) Analysis Done in ",
                  time.time() - fs, " seconds.")
            dfs.append("high1high2")
            labels.append("High %s with High %s" % (col1, col2))
        if high1low2Count > 0:
            fs = time.time()
            freq["high1low2"] = self.get_freq_dict(high1low2, cat_columns)[:3]
            print("get_freq_dict Analysis Done in ",
                  time.time() - fs, " seconds.")
            if not self._pandas_flag:
                contribution["high1low2"] = str(
                    round(
                        old_div(high1low2Count * 100,
                                self._data_frame.count()))) + "%"
            else:
                contribution["high1low2"] = str(
                    round(high1low2Count * 100 /
                          self._data_frame.shape[0])) + "%"
            fs = time.time()
            elasticity_dict["high1low2"] = self.run_regression(high1low2, col2)
            print("run_regression(elasticity) Analysis Done in ",
                  time.time() - fs, " seconds.")
            dfs.append("high1low2")
            labels.append("High %s with Low %s" % (col1, col2))
        fs = time.time()
        # overall_coeff = self._regression_result.get_coeff(col2)
        overall_coeff = (
            self._regression_result.get_all_coeff()[col2]["coefficient"])
        if not self._pandas_flag:
            elasticity_value = old_div(
                overall_coeff * Stats.mean(self._data_frame, col1),
                Stats.mean(self._data_frame, col2))
        else:
            elasticity_value = old_div(
                overall_coeff * self._data_frame[col1].mean(),
                self._data_frame[col2].mean())
        data_dict["overall_elasticity"] = elasticity_value
        label_dict = dict(list(zip(dfs, labels)))

        data_dict["measure_column"] = col2
        data_dict["result_column"] = col1
        data_dict["label_dict"] = label_dict
        data_dict["elastic_grp_list"] = []
        data_dict["inelastic_grp_list"] = []
        data_dict["elastic_count"] = 0
        data_dict["inelastic_count"] = 0
        for val in dfs:
            elastic_data = elasticity_dict[val]
            if elastic_data["elasticity_value"] > 1:
                data_dict["elastic_count"] += 1
                data_dict["elastic_grp_list"].append(
                    (label_dict[val], elastic_data["elasticity_value"]))
            else:
                data_dict["inelastic_count"] += 1
                data_dict["inelastic_grp_list"].append(
                    (label_dict[val], elastic_data["elasticity_value"]))

        data_dict["freq"] = freq
        data_dict["contribution"] = contribution
        data_dict["charts"] = {"heading": "", "data": []}

        col1_data = [col1]
        col2_data = [col2]
        color_data = ["Colors"]
        plotColors = []
        if low1low2Count > 0:
            sample_rows = min(100.0, float(low1low2Count))
            if not self._pandas_flag:
                low1low2 = low1low2.sample(False,
                                           old_div(sample_rows, low1low2Count),
                                           seed=50)
                low1low2_col1 = [x[0] for x in low1low2.select(col1).collect()]
                low1low2_col2 = [x[0] for x in low1low2.select(col2).collect()]
            else:
                low1low2 = low1low2.sample(replace=False,
                                           frac=old_div(
                                               sample_rows, low1low2Count),
                                           random_state=50)
                low1low2_col1 = low1low2[col1].tolist()
                low1low2_col2 = low1low2[col2].tolist()
            low1low2_color = ["#DD2E1F"] * len(low1low2_col2)
            col1_data += low1low2_col1
            col2_data += low1low2_col2
            color_data += low1low2_color
            plotColors.append("#DD2E1F")
        if low1high2Count > 0:
            sample_rows = min(100.0, float(low1high2Count))
            if not self._pandas_flag:
                low1high2 = low1high2.sample(False,
                                             old_div(sample_rows,
                                                     low1high2Count),
                                             seed=50)
                low1high2_col1 = [
                    x[0] for x in low1high2.select(col1).collect()
                ]
                low1high2_col2 = [
                    x[0] for x in low1high2.select(col2).collect()
                ]
            else:
                low1high2 = low1high2.sample(replace=False,
                                             frac=old_div(
                                                 sample_rows, low1high2Count),
                                             random_state=50)
                low1high2_col1 = low1high2[col1].tolist()
                low1high2_col2 = low1high2[col2].tolist()
            low1high2_color = ["#7C5BBB"] * len(low1high2_col2)
            col1_data += low1high2_col1
            col2_data += low1high2_col2
            color_data += low1high2_color
            plotColors.append("#7C5BBB")
        if high1high2Count > 0:
            sample_rows = min(100.0, float(high1high2Count))
            if not self._pandas_flag:
                high1high2 = high1high2.sample(False,
                                               old_div(sample_rows,
                                                       high1high2Count),
                                               seed=50)
                high1high2_col1 = [
                    x[0] for x in high1high2.select(col1).collect()
                ]
                high1high2_col2 = [
                    x[0] for x in high1high2.select(col2).collect()
                ]
            else:
                high1high2 = high1high2.sample(replace=False,
                                               frac=old_div(
                                                   sample_rows,
                                                   high1high2Count),
                                               random_state=50)
                high1high2_col1 = high1high2[col1].tolist()
                high1high2_col2 = high1high2[col2].tolist()
            high1high2_color = ["#00AEB3"] * len(high1high2_col2)
            col1_data += high1high2_col1
            col2_data += high1high2_col2
            color_data += high1high2_color
            plotColors.append("#00AEB3")
        if high1low2Count > 0:
            sample_rows = min(100.0, float(high1low2Count))
            if not self._pandas_flag:
                high1low2 = high1low2.sample(False,
                                             old_div(sample_rows,
                                                     high1low2Count),
                                             seed=50)
                high1low2_col1 = [
                    x[0] for x in high1low2.select(col1).collect()
                ]
                high1low2_col2 = [
                    x[0] for x in high1low2.select(col2).collect()
                ]
            else:
                high1low2 = high1low2.sample(replace=False,
                                             frac=old_div(
                                                 sample_rows, high1low2Count),
                                             random_state=50)
                high1low2_col1 = high1low2[col1].tolist()
                high1low2_col2 = high1low2[col2].tolist()
            high1low2_color = ["#EC640C"] * len(high1low2_col2)
            col1_data += high1low2_col1
            col2_data += high1low2_col2
            color_data += high1low2_color
            plotColors.append("#EC640C")

        plot_labels = dict(list(zip(plotColors, labels)))
        all_data = sorted(zip(col2_data[1:], col1_data[1:], color_data[1:]),
                          key=lambda x: x[1])

        scatterData = ScatterChartData()
        data_obj = dict(list(zip(labels, [[] for i in range(len(labels))])))
        for val in all_data:
            col = val[2]
            obj = {col1: val[1], col2: val[0]}
            key = plot_labels[col]
            data_obj[key].append(obj)
        scatterData.set_data(data_obj)
        scatterChart = ChartJson()
        scatterChart.set_data(scatterData.get_data())
        scatterChart.set_legend(plot_labels)
        scatterChart.set_label_text({"x": col2, "y": col1})
        scatterChart.set_axes({"x": col2, "y": col1})
        scatterChart.set_chart_type("scatter")
        data_dict["charts"] = scatterChart
        print("dsa Analysis Done in ", time.time() - fs, " seconds.")
        return data_dict
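
At its core, the card splits the data into four quadrants around the two column means; a compact pandas sketch of that split (frame and column names invented):

import pandas as pd

df = pd.DataFrame({"sales": [5.0, 9.0, 12.0, 20.0],
                   "spend": [1.0, 4.0, 6.0, 9.0]})
m1, m2 = df["sales"].mean(), df["spend"].mean()
quadrants = {
    "low1low2": df[(df["sales"] < m1) & (df["spend"] < m2)],
    "low1high2": df[(df["sales"] < m1) & (df["spend"] >= m2)],
    "high1high2": df[(df["sales"] >= m1) & (df["spend"] >= m2)],
    "high1low2": df[(df["sales"] >= m1) & (df["spend"] < m2)],
}
print({k: len(v) for k, v in quadrants.items()})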