def post(self):
    """Return a JSON summary (pandas ``describe``) of the numeric variables.

    The dataset is resolved from the ``sessionId`` field of the request body.
    """
    session_id = request.get_json()['sessionId']
    df = file_service.read_file(session_id)
    return df.describe().to_json()
def get(self, datetime, file_name, x, y, hue, reg):
    """Return a PNG scatter plot for two numeric variables x and y,
    optionally colored by a categorical hue.

    :param datetime: the current datetime in integer format (cache-buster)
    :param file_name: the file name represented by session_id
    :param x: the x variable
    :param y: the y variable
    :param hue: the categorical hue for the plot, or 'none' for no hue
    :param reg: whether to display as a regression plot
    :return: a bytes image of the seaborn scatter plot
    """
    # Removed the unused `columns` local (it was computed but never read)
    # and the unused `plot` binding.
    data = file_service.read_file(file_name)
    # Downsample very large datasets so rendering stays responsive.
    if len(data.index) > 15000:
        data = data.sample(n=15000)
    plot_service.get_scatter_plot(data, hue, x, y, reg)
    plt.tight_layout()
    bytes_image = io.BytesIO()
    plt.savefig(bytes_image, format='png')
    bytes_image.seek(0)
    plt.close()
    return send_file(bytes_image,
                     attachment_filename="scatter.png",
                     mimetype='image/png')
def get(self, datetime, file_name, columns):
    """Return a Pearson correlation heatmap PNG for the selected columns.

    :param datetime: the current datetime in integer format (cache-buster)
    :param file_name: the file name represented by session_id
    :param columns: comma-separated column names to correlate
    :return: a bytes image of the seaborn heatmap
    """
    data = file_service.read_file(file_name)
    f, ax = plt.subplots(figsize=(16, 9))
    correlation_df = data[columns.split(',')].select_dtypes(
        include=[np.number]).corr(method='pearson')
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the
    # documented replacement and behaves identically here.
    mask = np.zeros_like(correlation_df, dtype=bool)
    # Mask the upper triangle so each pair is shown exactly once.
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(correlation_df, mask=mask, annot=True, cmap=cmap,
                vmax=.3, center=0, square=True, linewidths=.5,
                cbar_kws={"shrink": .5})
    plt.tight_layout()
    bytes_image = io.BytesIO()
    plt.savefig(bytes_image, format='png')
    bytes_image.seek(0)
    plt.close()
    return send_file(bytes_image,
                     attachment_filename="correlation-plot.png",
                     mimetype='image/png')
def post(self):
    """Return the dataset's dimensions as ``{'rows': ..., 'columns': ...}``."""
    session_id = request.get_json()['sessionId']
    df = file_service.read_file(session_id)
    n_rows, n_cols = df.shape
    return json.dumps({'rows': n_rows, 'columns': n_cols})
def post(self):
    """Return a JSON summary (``describe``) of the categorical variables.

    Categorical means object- or bool-typed columns.
    """
    session_id = request.get_json()['sessionId']
    df = file_service.read_file(session_id)
    return df.select_dtypes(include=['object', 'bool']).describe().to_json()
def post(self, column_name):
    """Return the sorted unique values for the given column as JSON.

    :param column_name: the column whose unique values are returned
    """
    file_name = request.get_json()['sessionId']
    data = file_service.read_file(file_name)
    # Assign the filled Series back instead of Series.fillna(inplace=True):
    # in-place fillna on a column selection is chained assignment and may
    # modify a temporary copy rather than `data` itself.
    data[column_name] = data[column_name].fillna('Missing')
    uniques = pd.unique(data[column_name])
    # NOTE(review): np.sort on a numeric column that received the string
    # 'Missing' would mix types — presumably this endpoint is only used for
    # categorical columns; confirm against the caller.
    return json.dumps({
        'uniques': np.sort(uniques).tolist()
    })
def get(self, datetime, file_name):
    """Return a CSV attachment with all missing data imputed.

    Numeric columns are filled with their median; remaining (non-numeric)
    gaps are filled with the literal string 'missing'.

    :param datetime: the current datetime in integer format (cache-buster)
    :param file_name: the file name represented by session_id
    """
    data = file_service.read_file(file_name)
    # numeric_only=True: DataFrame.median() over object columns raises a
    # TypeError under pandas >= 2.0; we only ever want numeric medians here.
    data.fillna(data.median(numeric_only=True), inplace=True)
    data.fillna(value='missing', inplace=True)
    resp = make_response(data.to_csv(index=False))
    resp.headers["Content-Disposition"] = "attachment; filename=imputed_missing.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp
def post(self):
    """Return the column labels, split into numeric and categorical groups."""
    session_id = request.get_json()['sessionId']
    df = file_service.read_file(session_id)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
    return json.dumps({
        'categorical': list(categorical_cols.values),
        'numeric': list(numeric_cols.values)
    })
def post(self, x, y, show_as_percentages=False):
    """Return a cross tabulation (pivot table) of x against y as JSON.

    :param x: the row variable
    :param y: the column variable
    :param show_as_percentages: if truthy, normalize each row to percentages
    :return: dict with the crosstab and its transpose, both JSON-encoded
    """
    file_name = request.get_json()['sessionId']
    data = file_service.read_file(file_name)
    if show_as_percentages:
        # Row-normalize, then scale to 0-100 and round to 4 decimals first.
        crosstab = pd.crosstab(data[x], data[y], normalize='index').round(4) * 100
    else:
        crosstab = pd.crosstab(data[x], data[y])
    transposed = crosstab.transpose()
    return {
        'crosstab': crosstab.to_json(),
        'transposed': transposed.to_json()
    }
def get(self, datetime, file_name, x, target_column, target_value,
        analysis_type, is_actuals):
    """Return a bar plot PNG of each category in `x` against the target.

    :param datetime: the current datetime in integer format (cache-buster)
    :param file_name: the file name represented by session_id
    :param x: the x variable
    :param target_column: the target variable of the analysis
    :param target_value: the target value of the analysis
    :param analysis_type: 'categorical' or 'continuous' analysis type
    :param is_actuals: 1 for actual counts, 0 for percentages
    """
    data = file_service.read_file(file_name)
    data = influencers_service.bin_continuous_cols(data, target_column)
    f, ax = plt.subplots(figsize=(15, 11))
    if analysis_type != 'continuous':
        target_value = plot_service.convert_continuous_target_dtype(
            data, target_column, target_value
        )
    filtered_df = data[data[target_column] == target_value]
    plot = plot_service.get_influencers_plot(
        data, filtered_df, x, target_column, analysis_type, is_actuals)
    # Removed a dead `base_count = len(...)` in the is_actuals == 1 branch:
    # it scanned the frame and was never used.
    if is_actuals != 1 and is_numeric_dtype(data[target_column]):
        # Dashed baseline at the overall target mean so each bar can be
        # compared against the dataset-wide average.
        base_mean = data[target_column].mean()
        plot.axhline(base_mean, ls='--', label=f'Average {target_column}')
        plt.legend()
    plt.tight_layout()
    bytes_image = io.BytesIO()
    plt.savefig(bytes_image, format='png')
    bytes_image.seek(0)
    plt.close()
    return send_file(bytes_image,
                     attachment_filename="influencers.png",
                     mimetype='image/png')
def get(self, datetime, file_name):
    """Return a standard seaborn pairplot PNG for the dataset.

    :param datetime: the current datetime in integer format (cache-buster)
    :param file_name: the file name represented by session_id
    """
    data = file_service.read_file(file_name)
    # sns.pairplot creates (and makes current) its own figure; the previous
    # `plt.subplots(figsize=(16, 9))` here produced an extra figure that was
    # never drawn on and never closed — a per-request figure leak.
    sns.pairplot(data)
    plt.tight_layout()
    bytes_image = io.BytesIO()
    plt.savefig(bytes_image, format='png')
    bytes_image.seek(0)
    plt.close()
    return send_file(bytes_image,
                     attachment_filename="pairplot.png",
                     mimetype='image/png')
def get(self, variable, file_name, datetime):
    """Return a PNG distribution plot for a numeric variable, or a
    countplot for a categorical one.

    :param variable: the column to plot
    :param file_name: the file name represented by session_id
    :param datetime: the current datetime in integer format (cache-buster)
    """
    df = file_service.read_file(file_name)
    fig, axes = plt.subplots(figsize=(11, 9))
    plot_service.get_distribution_plot(df, variable)
    plt.tight_layout()
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    plt.close()
    return send_file(buffer,
                     attachment_filename=f"{variable}_distribution.png",
                     mimetype='image/png')
def get(self, datetime, file_name):
    """Return a missing-data heatmap PNG for the dataset.

    Each cell is colored by whether the underlying value is null.

    :param datetime: the current datetime in integer format (cache-buster)
    :param file_name: the file name represented by session_id
    """
    df = file_service.read_file(file_name)
    fig, axes = plt.subplots(figsize=(16, 9))
    sns.heatmap(df.isnull(), cbar=False, cmap="YlGnBu_r")
    plt.tight_layout()
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    plt.close()
    return send_file(buffer,
                     attachment_filename="missing-data.png",
                     mimetype='image/png')
def post(self):
    """Return dataset metadata as JSON: missing/duplicate counts and
    percentages, memory footprint, dimensions, column-type counts, and
    any warning messages.
    """
    session_id = request.get_json()['sessionId']
    df = file_service.read_file(session_id)
    missing_count, missing_pct = data_service.get_missing_values_info(df)
    duplicate_count, duplicate_pct = data_service.get_duplicates_info(df)
    n_rows, n_cols = df.shape
    return json.dumps({
        'totalMissingValues': int(missing_count),
        'totalMissingPercent': missing_pct,
        'totalKbInMemory': data_service.get_total_kilobytes_in_memory(df),
        'variables': int(n_cols),
        'observations': int(n_rows),
        'totalDuplicatedRows': int(duplicate_count),
        'totalDuplicatedPercent': duplicate_pct,
        'columnsInfo': data_service.get_column_type_counts(df),
        'warningMessages': data_service.populate_warning_messages(df)
    })
def post(self, analysis_type, target_column, target_value):
    """Return the key influencers for a target column/value as JSON.

    :param analysis_type: 'categorical' or 'continuous'
    :param target_column: the target variable of the analysis
    :param target_value: the target value (coerced to the column's dtype)
    """
    session_id = request.get_json()['sessionId']
    data = file_service.read_file(session_id)
    if analysis_type != 'continuous':
        # Coerce the URL/string value to match the target column's dtype
        # so equality comparisons in the service behave correctly.
        dtype_name = str(data[target_column].dtypes)
        caster = {'float64': float, 'int64': int}.get(dtype_name, str)
        target_value = caster(target_value)
    if analysis_type == 'categorical':
        method = 'classification'
    else:
        method = 'regression'
    influencers = influencers_service.find_key_influencers(
        target_column, target_value, df=data, method=method)
    return json.dumps({'influencers': influencers})
def test_read_file(client):
    """The fixture file loads into a DataFrame of the expected shape."""
    # act
    loaded = file_service.read_file(FILE_NAME)

    # assert
    rows, cols = loaded.shape
    assert (rows, cols) == (891, 12)