def test(self): """ Perform Independent sample t-test :return: """ indep_col = FN.col(self._independent_var) dep_col = FN.col(self._dependent_var) sample1 = self._data_frame.select(dep_col).filter( indep_col == self._independent_var_levels[0]) sample2 = self._data_frame.select(dep_col).filter( indep_col == self._independent_var_levels[1]) sample1_size = sample1.count() sample2_size = sample2.count() sample1_variance = Stats.variance(sample1, self._dependent_var) sample2_variance = Stats.variance(sample2, self._dependent_var) if sample1_variance == sample2_variance: if sample1_size == sample2_size: return self._ttest_equal_size_samples_with_same_variance( sample1_size, sample1, sample2, sample1_variance, sample2_variance) else: return self._ttest_unequal_size_samples_with_same_variance( sample1, sample2, sample1_variance, sample2_variance) return self._ttest_with_different_sample_variances( sample1, sample2, sample1_variance, sample2_variance)
def _ttest_unequal_size_samples_with_same_variance(self, sample1, sample2,
                                                   sample1_variance,
                                                   sample2_variance):
    sample1_size = sample1.count()
    sample2_size = sample2.count()
    sample1_mean = Stats.mean(sample1, self._dependent_var)
    sample2_mean = Stats.mean(sample2, self._dependent_var)
    degrees_of_freedom = sample1_size + sample2_size - 2
    pooled_std_dev = math.sqrt(
        ((sample1_size - 1) * sample1_variance +
         (sample2_size - 1) * sample2_variance) / degrees_of_freedom)
    # Use explicit float division; the sample sizes are ints and integer
    # division would collapse 1/n to zero under Python 2 semantics.
    std_err = pooled_std_dev * math.sqrt(
        (1.0 / sample1_size) + (1.0 / sample2_size))
    t_value = (sample1_mean - sample2_mean) / std_err
    p_value = Stats.t_distribution_critical_value(t_value,
                                                  df=degrees_of_freedom)
    return IndependentSampleTTestResult(
        indep_variable=self._independent_var,
        dep_variable=self._dependent_var,
        sample1_level=self._independent_var_levels[0],
        sample1_mean=sample1_mean,
        sample1_variance=sample1_variance,
        sample2_level=self._independent_var_levels[1],
        sample2_mean=sample2_mean,
        sample2_variance=sample2_variance,
        t_value=t_value,
        p_value=p_value,
        df=degrees_of_freedom)
def _ttest_with_different_sample_variances(self, sample1, sample2,
                                           sample1_variance,
                                           sample2_variance):
    # Welch's t-test
    sample1_size = sample1.count()
    sample2_size = sample2.count()
    sample1_mean = Stats.mean(sample1, self._dependent_var)
    sample2_mean = Stats.mean(sample2, self._dependent_var)
    degrees_of_freedom = (math.pow(
        (sample1_variance / sample1_size) +
        (sample2_variance / sample2_size), 2)) / (
            (math.pow(sample1_variance, 2) /
             (math.pow(sample1_size, 2) * (sample1_size - 1))) +
            (math.pow(sample2_variance, 2) /
             (math.pow(sample2_size, 2) * (sample2_size - 1))))
    t_value = (sample1_mean - sample2_mean) / math.sqrt(
        (sample1_variance / sample1_size) +
        (sample2_variance / sample2_size))
    p_value = Stats.t_distribution_critical_value(t_value,
                                                  df=degrees_of_freedom)
    return IndependentSampleTTestResult(
        indep_variable=self._independent_var,
        dep_variable=self._dependent_var,
        sample1_level=self._independent_var_levels[0],
        sample1_mean=sample1_mean,
        sample1_variance=sample1_variance,
        sample2_level=self._independent_var_levels[1],
        sample2_mean=sample2_mean,
        sample2_variance=sample2_variance,
        t_value=t_value,
        p_value=p_value,
        df=degrees_of_freedom)
def _ttest_equal_size_samples_with_same_variance(self, sample_size, sample1,
                                                 sample2, sample1_variance,
                                                 sample2_variance):
    sample1_mean = Stats.mean(sample1, self._dependent_var)
    sample2_mean = Stats.mean(sample2, self._dependent_var)
    pooled_standard_deviation = math.sqrt(
        (sample1_variance + sample2_variance) / 2)
    standard_error = pooled_standard_deviation * math.sqrt(
        2.0 / sample_size)
    t_value = (sample1_mean - sample2_mean) / standard_error
    degrees_of_freedom = 2 * sample_size - 2
    p_value = Stats.t_distribution_critical_value(t_value,
                                                  df=degrees_of_freedom)
    return IndependentSampleTTestResult(
        indep_variable=self._independent_var,
        dep_variable=self._dependent_var,
        sample1_level=self._independent_var_levels[0],
        sample1_mean=sample1_mean,
        sample1_variance=sample1_variance,
        sample2_level=self._independent_var_levels[1],
        sample2_mean=sample2_mean,
        sample2_variance=sample2_variance,
        t_value=t_value,
        p_value=p_value,
        df=degrees_of_freedom)
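As a quick sanity check on the three branches above, scipy reproduces both the pooled-variance and the Welch results when run on the raw samples. The snippet below is only a hedged, standalone sketch: it does not use the Spark Stats helpers, and the sample values are illustrative.

# Hedged cross-check sketch: scipy.stats.ttest_ind covers both the
# pooled-variance branch (equal_var=True) and the Welch branch
# (equal_var=False) implemented above. Data is illustrative only.
from scipy import stats

sample1 = [12.1, 14.3, 11.8, 13.9, 12.5]
sample2 = [10.2, 11.1, 9.8, 12.0, 10.7]

t_pooled, p_pooled = stats.ttest_ind(sample1, sample2, equal_var=True)
t_welch, p_welch = stats.ttest_ind(sample1, sample2, equal_var=False)
print(t_pooled, p_pooled)   # pooled-variance Student's t-test
print(t_welch, p_welch)     # Welch's t-test (Welch-Satterthwaite df)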
def feat_importance_linear(self):
    linear_list = []
    if self._pandas_flag:
        le = LabelEncoder()
        X_train = self.data_frame.drop(self.target, axis=1)
        if self.problem_type != 'REGRESSION':
            try:
                Y_train = le.fit_transform(self.data_frame[self.target])
            except:
                Y_train = le.fit_transform(
                    self.data_frame[self.target].astype(str))
        else:
            Y_train = self.data_frame[self.target]
        X_train = X_train[X_train._get_numeric_data().columns]
        for c in list(X_train.columns):
            pearson_coef, p_value = stats.pearsonr(X_train[c], Y_train)
            if p_value < 0.05:
                linear_list.append(c)
    else:
        if self.problem_type != 'REGRESSION':
            indexer = StringIndexer(inputCol=self.target, outputCol="label")
            indexed = indexer.fit(self.data_frame).transform(self.data_frame)
            X_train = indexed.drop('label')
            num_var = [i[0] for i in X_train.dtypes
                       if (i[1] == 'int') | (i[1] == 'double')]
            num_of_samples = indexed.select(num_var[0]).count()
            for column_one in num_var:
                corr = indexed.corr(column_one, 'label')
                # num_of_samples = indexed.select(column_one).count()
                df = num_of_samples - 2
                std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                t_value = old_div(corr, std_error)
                p_value = Stats.t_distribution_critical_value(t_value, df=df)
                if p_value < 0.05:
                    linear_list.append(column_one)
        else:
            X_train = self.data_frame.drop(self.target)
            num_var = [i[0] for i in X_train.dtypes
                       if (i[1] == 'int') | (i[1] == 'double')]
            for column_one in num_var:
                corr = self.data_frame.corr(column_one, self.target)
                num_of_samples = self.data_frame.select(column_one).count()
                df = num_of_samples - 2
                std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                t_value = old_div(corr, std_error)
                p_value = Stats.t_distribution_critical_value(t_value, df=df)
                if p_value < 0.05:
                    linear_list.append(column_one)
    self.data_change_dict['SelectedColsLinear'] = linear_list
    linear_list.append(self.target)
    return linear_list
def _corr(self, column_one, column_two):
    """
    Finds the correlation between two columns and also calculates
    a) statistical significance info,
    b) effect size info (coefficient of determination), and
    c) confidence intervals.
    :param column_one:
    :param column_two:
    :return: CorrelationStats
    """
    corr = self._data_frame.corr(column_one, column_two)
    num_of_samples = self._data_frame.select(column_one).count()
    df = num_of_samples - 2
    std_error = math.sqrt((1 - math.pow(corr, 2)) / df)
    t_value = corr / std_error
    p_value = Stats.t_distribution_critical_value(t_value, df=df)
    coeff_determination = math.pow(corr, 2)
    corr_stats = CorrelationStats(correlation=corr,
                                  std_error=std_error,
                                  t_value=t_value,
                                  p_value=p_value,
                                  degrees_of_freedom=df,
                                  coeff_determination=coeff_determination)
    for alpha in ALPHA_LEVELS:
        (lower_bound, upper_bound) = self._confidence_interval(
            corr, num_of_samples, alpha)
        corr_stats.set_confidence_interval(alpha, lower_bound, upper_bound)
    return corr_stats
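The significance test above is the standard t-statistic for a Pearson correlation, t = r * sqrt(n - 2) / sqrt(1 - r^2) on n - 2 degrees of freedom. The sketch below is a hedged cross-check against scipy on illustrative data; it assumes Stats.t_distribution_critical_value reports the matching two-sided p-value.

# Hedged sketch: manual correlation t-test vs. scipy.stats.pearsonr.
# Data is illustrative only.
import math
from scipy import stats

x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
y = [2.1, 2.9, 4.2, 3.8, 5.1, 6.3]

r, p_scipy = stats.pearsonr(x, y)
df = len(x) - 2
t_value = r * math.sqrt(df) / math.sqrt(1 - r ** 2)
p_manual = 2 * stats.t.sf(abs(t_value), df)
print(round(p_scipy, 6), round(p_manual, 6))  # the two p-values should agree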
def test(self):
    column1 = FN.col(self._column1)
    column2 = FN.col(self._column2)
    diff_column_name = 'diff'
    diff_expr = (column2 - column1).alias(diff_column_name)
    sample_of_differences = self._data_frame.select(diff_expr)
    sample_size = sample_of_differences.count()
    sample_mean = Stats.mean(sample_of_differences, diff_column_name)
    sample_sd = Stats.standard_deviation(sample_of_differences,
                                         diff_column_name)
    t_value = float(sample_mean) / (old_div(sample_sd,
                                            math.sqrt(sample_size)))
    degree_of_freedom = sample_size - 1
    p_value = Stats.t_distribution_critical_value(t_value,
                                                  df=degree_of_freedom)
    return DependentSampleTtestResult(column1=self._column1,
                                      column2=self._column2,
                                      sample_size=sample_size,
                                      mean_of_differences=sample_mean,
                                      df=degree_of_freedom,
                                      t_value=t_value,
                                      p_value=p_value)
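This is the dependent (paired) sample t-test on the column-wise differences. A hedged cross-check on raw values is straightforward with scipy; the data below is illustrative only, and the argument order mirrors the diff = column2 - column1 convention used above.

# Hedged cross-check sketch: scipy.stats.ttest_rel runs the same paired
# t-test directly on the before/after values.
from scipy import stats

before = [10.2, 11.5, 9.8, 12.1, 10.9]
after = [11.0, 12.3, 10.1, 12.9, 11.6]

t_value, p_value = stats.ttest_rel(after, before)
print(t_value, p_value)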
def get_measure_column_splits(self, df, colname, n_split=5):
    """
    n_split = number of splits required - 1
    splits = [0.0, 23.0, 46.0, 69.0, 92.0, 115.0]
    splits_range = [(0.0, 23.0), (23.0, 46.0), (46.0, 69.0),
                    (69.0, 92.0), (92.0, 115.0)]
    """
    # NOTE: the keyword argument is overridden here, so callers always
    # get 5 splits regardless of the value they pass for n_split.
    n_split = 5
    minimum_val = Stats.min(df, colname)
    maximum_val = Stats.max(df, colname)
    splits = CommonUtils.frange(minimum_val, maximum_val, num_steps=n_split)
    splits = sorted(splits)
    splits_range = [(splits[idx], splits[idx + 1])
                    for idx in range(len(splits) - 1)]
    output = {"splits": splits, "splits_range": splits_range}
    return output
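The docstring example can be reproduced outside Spark with evenly spaced boundaries. This is a hedged sketch that assumes CommonUtils.frange behaves like numpy.linspace over num_steps equal intervals (i.e. returns num_steps + 1 boundary values); the project helper may differ in detail.

# Hedged sketch of the split construction for min=0.0, max=115.0, 5 steps.
import numpy as np

splits = np.linspace(0.0, 115.0, num=5 + 1).tolist()
splits_range = [(splits[i], splits[i + 1]) for i in range(len(splits) - 1)]
print(splits)        # [0.0, 23.0, 46.0, 69.0, 92.0, 115.0]
print(splits_range)  # [(0.0, 23.0), (23.0, 46.0), ..., (92.0, 115.0)]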
def generateClusterDataDict(self, measure_column, kmeans_result):
    kmeans_stats = kmeans_result["stats"]
    input_columns = kmeans_stats["inputCols"]
    kmeans_df = kmeans_result["data"]
    cluster_data_dict = {"chart_data": None, "grp_data": None}
    grp_df = kmeans_df.groupBy("prediction").count().toPandas()
    grp_counts = list(zip(grp_df["prediction"], grp_df["count"]))
    grp_counts = sorted(grp_counts, key=lambda x: x[1], reverse=True)
    grp_dict = dict(grp_counts)
    colors = ["red", "blue", "green", "yellow", "black"]
    cluster_ids = list(grp_df["prediction"])
    color_dict = dict(list(zip(cluster_ids, colors[:len(cluster_ids)])))
    chart_data = {"heading": "", "data": []}
    result_col_data = [self._result_column]
    measure_col_data = [measure_column]
    color_data = ["Colors"]
    plot_labels = ["Cluster Labels"]
    grp_data = []
    total = float(sum(grp_dict.values()))
    for grp_id in list(grp_df["prediction"]):
        data = {}
        data["group_number"] = grp_id + 1
        data["count"] = grp_dict[grp_id]
        data["contribution"] = round(
            old_div(grp_dict[grp_id] * 100, total), 2)
        df = kmeans_df.filter(FN.col("prediction") == grp_id)
        # One independent dict per input column; the previous
        # [{}] * len(input_columns) construction shared a single dict,
        # so every column ended up reporting the same average.
        data["columns"] = {col: {} for col in input_columns}
        for val in input_columns:
            data["columns"][val]["avg"] = round(Stats.mean(df, val), 2)
        grp_data.append(data)
        # preparing chart data
        grp_result_data = [
            x[0] for x in df.select(self._result_column).collect()
        ]
        result_col_data += grp_result_data
        grp_measure_data = [
            x[0] for x in df.select(measure_column).collect()
        ]
        measure_col_data += grp_measure_data
        color_list = [color_dict[grp_id]] * len(grp_measure_data)
        color_data += color_list
        label_list = ["Cluster " + str(int(grp_id))]
        plot_labels += label_list
    grp_data = sorted(grp_data, key=lambda x: x["contribution"],
                      reverse=True)
    chart_data = [
        measure_col_data, result_col_data, color_data, plot_labels
    ]
    cluster_data_dict["grp_data"] = grp_data
    cluster_data_dict["chart_data"] = chart_data
    return cluster_data_dict
def _confidence_interval(self, correlation, num_samples, alpha):
    """
    Finds the confidence interval for a correlation at a given alpha level.
    Ref: http://www2.sas.com/proceedings/sugi31/170-31.pdf
    :param correlation:
    :param num_samples:
    :param alpha:
    :return: tuple (lower_bound, upper_bound)
    """
    normalized_correlation = 0.5 * math.log(
        float(1 + correlation) / (1 - correlation))
    std_dev = math.sqrt(1.0 / (num_samples - 3))
    z_critical = Stats.normal_distribution_percentile_point_function(alpha)
    normalized_lowerbound = normalized_correlation - z_critical * std_dev
    normalized_upperbound = normalized_correlation + z_critical * std_dev
    return (math.tanh(normalized_lowerbound),
            math.tanh(normalized_upperbound))
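The interval is the usual Fisher z-transform construction: transform r with atanh, add and subtract a normal critical value times 1/sqrt(n - 3), then map back with tanh. The sketch below is a hedged scipy-only version on made-up numbers; it assumes Stats.normal_distribution_percentile_point_function(alpha) returns the normal quantile appropriate for a two-sided (1 - alpha) interval, which may differ from the helper's exact convention.

# Hedged sketch of the Fisher z confidence interval for a correlation.
import math
from scipy import stats

r, n, alpha = 0.62, 50, 0.05
z = 0.5 * math.log((1 + r) / (1 - r))     # Fisher z-transform (atanh)
se = math.sqrt(1.0 / (n - 3))
z_crit = stats.norm.ppf(1 - alpha / 2)    # two-sided normal critical value
lower = math.tanh(z - z_crit * se)
upper = math.tanh(z + z_crit * se)
print(round(lower, 3), round(upper, 3))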
def remove_outliers(self, df, outlier_removal_col):
    '''Need to check how it will affect multiple columns'''
    outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
        self._data_frame, outlier_removal_col)
    # Apply both bounds to the same frame; filtering self._data_frame twice
    # would silently discard the lower-bound filter.
    df = self._data_frame.filter(
        self._data_frame[outlier_removal_col] > ol_lower_range)
    df = df.filter(df[outlier_removal_col] < ol_upper_range)
    return df
def run_regression(self, df, measure_column):
    output = {}
    result_column = self._result_column
    result = LinearRegression(df, self._dataframe_helper,
                              self._dataframe_context, self._metaParser,
                              self._spark).fit(result_column)
    result = {
        "intercept": result.get_intercept(),
        "rmse": result.get_root_mean_square_error(),
        "rsquare": result.get_rsquare(),
        "coeff": result.get_all_coeff()
    }
    if measure_column in result["coeff"].keys():
        output["coeff"] = result["coeff"][measure_column]["coefficient"]
        try:
            output["elasticity_value"] = output["coeff"] * Stats.mean(
                df, result_column) / Stats.mean(df, measure_column)
        except:
            output["elasticity_value"] = 0
    else:
        output["coeff"] = 0
        output["elasticity_value"] = 0
    return output
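The elasticity value here follows the codebase's convention of scaling the regression coefficient by the ratio of the result-column mean to the measure-column mean; groups whose value exceeds 1 are later labelled elastic in generate_card4_data. A minimal hedged illustration with made-up numbers:

# Hedged illustration of the elasticity convention used above.
coeff = 0.8
mean_result, mean_measure = 250.0, 40.0
elasticity_value = coeff * mean_result / mean_measure
print(elasticity_value)   # 5.0 -> would be counted as "elastic" (> 1)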
def cap_outliers(self, outlier_replacement_col):
    outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
        self._data_frame, outlier_replacement_col)
    df_dup = self._data_frame
    self._data_frame = df_dup.withColumn(
        outlier_replacement_col,
        when((df_dup[outlier_replacement_col] < ol_lower_range),
             ol_lower_range).otherwise(df_dup[outlier_replacement_col]))
    self._data_frame = self._data_frame.withColumn(
        outlier_replacement_col,
        when((self._data_frame[outlier_replacement_col] > ol_upper_range),
             ol_upper_range).otherwise(
                 self._data_frame[outlier_replacement_col]))
    return self._data_frame
def mode_impute_outliers(self, outlier_imputation_col):
    outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
        self._data_frame, outlier_imputation_col)
    # df_dup = self._data_frame
    df_without_outliers = self.remove_outliers(self._data_frame,
                                               outlier_imputation_col)
    mode_without_outliers = self.get_mode(
        self._data_frame, df_without_outliers[outlier_imputation_col])
    self._data_frame = self._data_frame.withColumn(
        outlier_imputation_col,
        when((self._data_frame[outlier_imputation_col] < ol_lower_range) |
             (self._data_frame[outlier_imputation_col] > ol_upper_range),
             mode_without_outliers).otherwise(
                 self._data_frame[outlier_imputation_col]))
    return self._data_frame
def mean_impute_outliers(self, outlier_imputation_col):
    outlier_count, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
        self._data_frame, outlier_imputation_col)
    # df_dup = self._data_frame
    df_without_outliers = self.remove_outliers(self._data_frame,
                                               outlier_imputation_col)
    mean_without_outliers = df_without_outliers.agg(
        avg(outlier_imputation_col)).first()[0]
    self._data_frame = self._data_frame.withColumn(
        outlier_imputation_col,
        when((self._data_frame[outlier_imputation_col] < ol_lower_range) |
             (self._data_frame[outlier_imputation_col] > ol_upper_range),
             mean_without_outliers).otherwise(
                 self._data_frame[outlier_imputation_col]))
    return self._data_frame
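For intuition, the three outlier treatments above (remove, cap, mean-impute) can be sketched on a pandas Series. This is a hedged sketch: it assumes Stats.detect_outliers_z derives its bounds from a z-score rule, approximated here as mean +/- 3 standard deviations, which may not match the real helper's cutoff.

# Hedged pandas sketch of remove / cap / mean-impute with z-score bounds.
import pandas as pd

s = pd.Series([10, 11, 12, 13, 11, 12, 10, 13, 11, 12] * 3 + [95])
lower = s.mean() - 3 * s.std()
upper = s.mean() + 3 * s.std()

kept = s[(s > lower) & (s < upper)]                    # remove_outliers
capped = s.clip(lower=lower, upper=upper)              # cap_outliers
mean_imputed = s.where((s > lower) & (s < upper),      # mean_impute_outliers
                       kept.mean())
print(kept.tolist())
print(capped.tolist())
print(mean_imputed.tolist())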
def generate_card4_data(self, col1, col2):
    # col1 is the result column, col2 is the measure column
    fs = time.time()
    data_dict = {}
    significant_dimensions = \
        self._dataframe_helper.get_significant_dimension()
    print()
    print("-" * 100)
    print("Target Column : ", col1)
    print("Measure Column : ", col2)
    print("significant_dimensions : ", significant_dimensions)
    if significant_dimensions != {}:
        sig_dims = [(x, significant_dimensions[x])
                    for x in list(significant_dimensions.keys())]
        sig_dims = sorted(sig_dims, key=lambda x: x[1], reverse=True)
        cat_columns = [x[0] for x in sig_dims[:10]]
    else:
        cat_columns = self._dataframe_helper.get_string_columns()[:10]
    if not self._pandas_flag:
        col1_mean = Stats.mean(self._data_frame, col1)
        col2_mean = Stats.mean(self._data_frame, col2)
    else:
        col1_mean = self._data_frame[col1].mean()
        col2_mean = self._data_frame[col2].mean()
    print("col1=>", col1, " | col2=>", col2)
    print(col1_mean, col2_mean)
    if not self._pandas_flag:
        low1low2 = self._data_frame.filter(
            FN.col(col1) < col1_mean).filter(FN.col(col2) < col2_mean)
        low1high2 = self._data_frame.filter(
            FN.col(col1) < col1_mean).filter(FN.col(col2) >= col2_mean)
        high1high2 = self._data_frame.filter(
            FN.col(col1) >= col1_mean).filter(FN.col(col2) >= col2_mean)
        high1low2 = self._data_frame.filter(
            FN.col(col1) >= col1_mean).filter(FN.col(col2) < col2_mean)
        low1low2Count = low1low2.count()
        low1high2Count = low1high2.count()
        high1high2Count = high1high2.count()
        high1low2Count = high1low2.count()
    else:
        low1low2 = self._data_frame[(self._data_frame[col1] < col1_mean)
                                    & (self._data_frame[col2] < col2_mean)]
        low1high2 = self._data_frame[(self._data_frame[col1] < col1_mean)
                                     & (self._data_frame[col2] >= col2_mean)]
        high1high2 = self._data_frame[(self._data_frame[col1] >= col1_mean)
                                      & (self._data_frame[col2] >= col2_mean)]
        high1low2 = self._data_frame[(self._data_frame[col1] >= col1_mean)
                                     & (self._data_frame[col2] < col2_mean)]
        low1low2Count = low1low2.shape[0]
        low1high2Count = low1high2.shape[0]
        high1high2Count = high1high2.shape[0]
        high1low2Count = high1low2.shape[0]
    contribution = {}
    freq = {}
    elasticity_dict = {}
    print("low1low2:", low1low2Count)
    print("low1high2:", low1high2Count)
    print("high1high2:", high1high2Count)
    print("high1low2:", high1low2Count)
    print("quadrant dataframe creation Done in ", time.time() - fs,
          " seconds.")
    dfs = []
    labels = []
    if low1low2Count > 0:
        fs = time.time()
        freq["low1low2"] = self.get_freq_dict(low1low2, cat_columns)[:3]
        print("get_freq_dict Analysis Done in ", time.time() - fs,
              " seconds.")
        if not self._pandas_flag:
            contribution["low1low2"] = str(
                round(old_div(low1low2Count * 100,
                              self._data_frame.count()))) + "%"
        else:
            contribution["low1low2"] = str(
                round(low1low2Count * 100 /
                      self._data_frame.shape[0])) + "%"
        fs = time.time()
        elasticity_dict["low1low2"] = self.run_regression(low1low2, col2)
        print("run_regression(elasticity) Analysis Done in ",
              time.time() - fs, " seconds.")
        dfs.append("low1low2")
        labels.append("Low %s with Low %s" % (col1, col2))
    if low1high2Count > 0:
        fs = time.time()
        freq["low1high2"] = self.get_freq_dict(low1high2, cat_columns)[:3]
        print("get_freq_dict Analysis Done in ", time.time() - fs,
              " seconds.")
        if not self._pandas_flag:
            contribution["low1high2"] = str(
                round(old_div(low1high2Count * 100,
                              self._data_frame.count()))) + "%"
        else:
            contribution["low1high2"] = str(
                round(low1high2Count * 100 /
                      self._data_frame.shape[0])) + "%"
        fs = time.time()
        elasticity_dict["low1high2"] = self.run_regression(low1high2, col2)
        print("run_regression(elasticity) Analysis Done in ",
              time.time() - fs, " seconds.")
        dfs.append("low1high2")
        labels.append("Low %s with High %s" % (col1, col2))
    if high1high2Count > 0:
        fs = time.time()
        freq["high1high2"] = self.get_freq_dict(high1high2, cat_columns)[:3]
        print("get_freq_dict Analysis Done in ", time.time() - fs,
              " seconds.")
        if not self._pandas_flag:
            contribution["high1high2"] = str(
                round(old_div(high1high2Count * 100,
                              self._data_frame.count()))) + "%"
        else:
            contribution["high1high2"] = str(
                round(high1high2Count * 100 /
                      self._data_frame.shape[0])) + "%"
        fs = time.time()
        elasticity_dict["high1high2"] = self.run_regression(high1high2, col2)
        print("run_regression(elasticity) Analysis Done in ",
              time.time() - fs, " seconds.")
        dfs.append("high1high2")
        labels.append("High %s with High %s" % (col1, col2))
    if high1low2Count > 0:
        fs = time.time()
        freq["high1low2"] = self.get_freq_dict(high1low2, cat_columns)[:3]
        print("get_freq_dict Analysis Done in ", time.time() - fs,
              " seconds.")
        if not self._pandas_flag:
            contribution["high1low2"] = str(
                round(old_div(high1low2Count * 100,
                              self._data_frame.count()))) + "%"
        else:
            contribution["high1low2"] = str(
                round(high1low2Count * 100 /
                      self._data_frame.shape[0])) + "%"
        fs = time.time()
        elasticity_dict["high1low2"] = self.run_regression(high1low2, col2)
        print("run_regression(elasticity) Analysis Done in ",
              time.time() - fs, " seconds.")
        dfs.append("high1low2")
        labels.append("High %s with Low %s" % (col1, col2))
    fs = time.time()
    # overall_coeff = self._regression_result.get_coeff(col2)
    overall_coeff = \
        self._regression_result.get_all_coeff()[col2]["coefficient"]
    if not self._pandas_flag:
        elasticity_value = old_div(
            overall_coeff * Stats.mean(self._data_frame, col1),
            Stats.mean(self._data_frame, col2))
    else:
        elasticity_value = old_div(
            overall_coeff * self._data_frame[col1].mean(),
            self._data_frame[col2].mean())
    data_dict["overall_elasticity"] = elasticity_value
    label_dict = dict(list(zip(dfs, labels)))
    data_dict["measure_column"] = col2
    data_dict["result_column"] = col1
    data_dict["label_dict"] = label_dict
    data_dict["elastic_grp_list"] = []
    data_dict["inelastic_grp_list"] = []
    data_dict["elastic_count"] = 0
    data_dict["inelastic_count"] = 0
    for val in dfs:
        elastic_data = elasticity_dict[val]
        if elastic_data["elasticity_value"] > 1:
            data_dict["elastic_count"] += 1
            data_dict["elastic_grp_list"].append(
                (label_dict[val], elastic_data["elasticity_value"]))
        else:
            data_dict["inelastic_count"] += 1
            data_dict["inelastic_grp_list"].append(
                (label_dict[val], elastic_data["elasticity_value"]))
    data_dict["freq"] = freq
    data_dict["contribution"] = contribution
    data_dict["charts"] = {"heading": "", "data": []}
    col1_data = [col1]
    col2_data = [col2]
    color_data = ["Colors"]
    plotColors = []
    if low1low2Count > 0:
        sample_rows = min(100.0, float(low1low2Count))
        if not self._pandas_flag:
            low1low2 = low1low2.sample(False,
                                       old_div(sample_rows, low1low2Count),
                                       seed=50)
            low1low2_col1 = [x[0] for x in low1low2.select(col1).collect()]
            low1low2_col2 = [x[0] for x in low1low2.select(col2).collect()]
        else:
            low1low2 = low1low2.sample(replace=False,
                                       frac=old_div(sample_rows,
                                                    low1low2Count),
                                       random_state=50)
            low1low2_col1 = low1low2[col1].tolist()
            low1low2_col2 = low1low2[col2].tolist()
        low1low2_color = ["#DD2E1F"] * len(low1low2_col2)
        col1_data += low1low2_col1
        col2_data += low1low2_col2
        color_data += low1low2_color
        plotColors.append("#DD2E1F")
    if low1high2Count > 0:
        sample_rows = min(100.0, float(low1high2Count))
        if not self._pandas_flag:
            low1high2 = low1high2.sample(False,
                                         old_div(sample_rows,
                                                 low1high2Count),
                                         seed=50)
            low1high2_col1 = [
                x[0] for x in low1high2.select(col1).collect()
            ]
            low1high2_col2 = [
                x[0] for x in low1high2.select(col2).collect()
            ]
        else:
            low1high2 = low1high2.sample(replace=False,
                                         frac=old_div(sample_rows,
                                                      low1high2Count),
                                         random_state=50)
            low1high2_col1 = low1high2[col1].tolist()
            low1high2_col2 = low1high2[col2].tolist()
        low1high2_color = ["#7C5BBB"] * len(low1high2_col2)
        col1_data += low1high2_col1
        col2_data += low1high2_col2
        color_data += low1high2_color
        plotColors.append("#7C5BBB")
    if high1high2Count > 0:
        sample_rows = min(100.0, float(high1high2Count))
        if not self._pandas_flag:
            high1high2 = high1high2.sample(False,
                                           old_div(sample_rows,
                                                   high1high2Count),
                                           seed=50)
            high1high2_col1 = [
                x[0] for x in high1high2.select(col1).collect()
            ]
            high1high2_col2 = [
                x[0] for x in high1high2.select(col2).collect()
            ]
        else:
            high1high2 = high1high2.sample(replace=False,
                                           frac=old_div(sample_rows,
                                                        high1high2Count),
                                           random_state=50)
            high1high2_col1 = high1high2[col1].tolist()
            high1high2_col2 = high1high2[col2].tolist()
        high1high2_color = ["#00AEB3"] * len(high1high2_col2)
        col1_data += high1high2_col1
        col2_data += high1high2_col2
        color_data += high1high2_color
        plotColors.append("#00AEB3")
    if high1low2Count > 0:
        sample_rows = min(100.0, float(high1low2Count))
        if not self._pandas_flag:
            high1low2 = high1low2.sample(False,
                                         old_div(sample_rows,
                                                 high1low2Count),
                                         seed=50)
            high1low2_col1 = [
                x[0] for x in high1low2.select(col1).collect()
            ]
            high1low2_col2 = [
                x[0] for x in high1low2.select(col2).collect()
            ]
        else:
            high1low2 = high1low2.sample(replace=False,
                                         frac=old_div(sample_rows,
                                                      high1low2Count),
                                         random_state=50)
            high1low2_col1 = high1low2[col1].tolist()
            high1low2_col2 = high1low2[col2].tolist()
        high1low2_color = ["#EC640C"] * len(high1low2_col2)
        col1_data += high1low2_col1
        col2_data += high1low2_col2
        color_data += high1low2_color
        plotColors.append("#EC640C")
    plot_labels = dict(list(zip(plotColors, labels)))
    all_data = sorted(zip(col2_data[1:], col1_data[1:], color_data[1:]),
                      key=lambda x: x[1])
    scatterData = ScatterChartData()
    data_obj = dict(list(zip(labels, [[] for i in range(len(labels))])))
    for val in all_data:
        col = val[2]
        obj = {col1: val[1], col2: val[0]}
        key = plot_labels[col]
        data_obj[key].append(obj)
    scatterData.set_data(data_obj)
    scatterChart = ChartJson()
    scatterChart.set_data(scatterData.get_data())
    scatterChart.set_legend(plot_labels)
    scatterChart.set_label_text({"x": col2, "y": col1})
    scatterChart.set_axes({"x": col2, "y": col1})
    scatterChart.set_chart_type("scatter")
    data_dict["charts"] = scatterChart
    print("dsa Analysis Done in ", time.time() - fs, " seconds.")
    return data_dict