def get_source_target_dataframe(form):
    """Read the source and target files from a validated form into DataFrames.

    Both files are read with their first row as the column header.

    Args:
        form: Validated Django form whose ``cleaned_data`` contains
            ``source_file`` and ``target_file`` uploads.

    Returns:
        Tuple ``(df_source, df_target)`` of pandas DataFrames.
    """
    # NOTE(review): removed a large commented-out variant that honored
    # per-file header checkboxes; restore from VCS history if needed.
    source_file = form.cleaned_data["source_file"]
    target_file = form.cleaned_data["target_file"]
    df_source = DataFrameUtil.file_to_dataframe(source_file, header=0)
    df_target = DataFrameUtil.file_to_dataframe(target_file, header=0)
    return df_source, df_target
def lda_plot(request):
    """Project uploaded data with LDA and return 3D plot coordinates as JSON.

    Reads a data file and a label file from the form, fits a
    LinearDiscriminantAnalysis with 3 components, and returns the projected
    coordinates under ``resp_data['plot']`` (keys ``x``/``y``/``z``).
    On form validation failure, returns the escaped form errors.
    """
    form = LdaPlotForm(request.POST, request.FILES)
    resp_data = dict()

    # LDA 3D plot coordinates
    plot = dict()
    if form.is_valid():
        # Get input files
        data_file = form.cleaned_data["data_file"]
        label_file = form.cleaned_data["label_file"]
        df_input = DataFrameUtil.file_to_dataframe(data_file, header=None)
        df_label = DataFrameUtil.file_to_dataframe(label_file, header=None)

        clf = LinearDiscriminantAnalysis(n_components=3)
        X = df_input.values
        # ravel() gives the 1-D label vector sklearn expects (a single-column
        # DataFrame's .values is 2-D).
        y = df_label.values.ravel()

        # BUG FIX: the transformed coordinates were previously discarded and
        # the raw input columns plotted instead — plot the LDA projection.
        # NOTE(review): LDA yields at most min(n_classes - 1, n_features)
        # components; with fewer than 4 classes, n_components=3 will raise.
        X_lda = clf.fit_transform(X, y)

        plot['x'] = list(X_lda[:, 0])
        plot['y'] = list(X_lda[:, 1])
        plot['z'] = list(X_lda[:, 2])
        resp_data['plot'] = plot
    else:
        resp_data[msg.ERROR] = escape(form._errors)

    return JsonResponse(resp_data)
def extract_matched_key(key_file, data_file):
    """Select rows of ``data_file`` whose first-column key appears in ``key_file``.

    Both files are read without headers; matching is done on the first
    column of each. Returns the matching subset as a DataFrame.
    """
    df_keys = DataFrameUtil.file_to_dataframe(key_file, header=None)
    df_data = DataFrameUtil.file_to_dataframe(data_file, header=None)

    # Keys live in the first column of the key file.
    wanted_keys = list(df_keys.iloc[:, 0].values)

    # Keep only rows whose first column is one of the wanted keys.
    mask = df_data.iloc[:, 0].isin(wanted_keys)
    return df_data[mask]
def upload_file_handler(request):
    """Handle a POSTed data-file upload; return table data, columns and
    analysis results as JSON.

    Non-POST requests and invalid forms yield an error JSON payload.
    """
    if request.method != 'POST':
        return JsonResponse({msg.ERROR: "request is not POST."})

    form = UploadFileForm(request.POST, request.FILES)
    if not form.is_valid():
        # Form validation error
        return JsonResponse({msg.ERROR: escape(form._errors)})

    data_file = request.FILES['data_file']
    # Checkbox "on" means the first row of the file is the column header.
    header_row = 0 if form.cleaned_data['column_header'] == "on" else None

    df = DataFrameUtil.file_to_dataframe(data_file, header=header_row)
    analysis = DataFrameUtil.analyze_dataframe(df)
    json_rows, column_names = DataFrameUtil.dataframe_to_json(df)

    return JsonResponse({
        'table_data': json_rows,
        'table_columns': column_names,
        'analysis': analysis,
    })
def elbow_plot_handler(request):
    """Build a PCA elbow-curve plot for an uploaded data file and return the
    Bokeh script/div components as JSON.
    """
    form = PcaPlotForm(request.POST, request.FILES)
    resp_data = {}
    if form.is_valid():
        uploaded = form.cleaned_data["data_file"]
        df = DataFrameUtil.file_to_dataframe(uploaded, header=None)
        scaled = PreProcessingUtil.standardize(df)

        # Fit PCA and pull out per-component explained-variance ratios.
        helper = PcaUtil()
        fitted = helper.get_fit_transfrom_pca(scaled)
        variance_ratios = fitted.explained_variance_ratio_

        # Render the curve into a single Bokeh tab.
        curve = draw_elbow_plot(variance_ratios)
        panel = Panel(child=curve, title="Elbow Curve Plot")
        tabs = Tabs(tabs=[panel])

        script, div = components(tabs)
        resp_data["bokeh_plot"] = {'script': script, 'div': div}
    else:
        resp_data[msg.ERROR] = escape(form._errors)
    return JsonResponse(resp_data)
def save_data_handler(request):
    """Clean up an uploaded data file per form options and save it as CSV.

    Supports row slicing (``split_row_from``/``split_row_to``), NaN-row
    deletion, and column exclusion/retention. The cleaned frame is written
    under the project's base location as ``save_as_name`` and also returned
    as JSON (table data, columns, analysis).
    """
    form = SaveFileForm(request.POST, request.FILES)
    if form.is_valid():
        file = request.FILES["data_file"]
        choice_cleanup = form.cleaned_data["choice_cleanup"]
        column_header = form.cleaned_data["column_header"]
        exclude_columns = form.cleaned_data["exclude_columns"]
        remain_columns = form.cleaned_data["remain_columns"]
        split_row_from = form.cleaned_data["split_row_from"]
        split_row_to = form.cleaned_data["split_row_to"]
        save_as_name = form.cleaned_data["save_as_name"]

        if save_as_name:
            # When the column-header box is checked, row 0 is the header.
            column_header_idx = 0 if column_header == "on" else None
            df = DataFrameUtil.file_to_dataframe(file, header=column_header_idx)

            # BUG FIX: condition previously tested split_row_from twice,
            # so a missing split_row_to slipped through to int() below.
            if split_row_from and split_row_to:
                split_row_from_idx = int(split_row_from) - 1  # to zero-based
                split_row_to_idx = int(split_row_to)
                df = df.iloc[split_row_from_idx:split_row_to_idx, :]

            # Delete NaN rows
            if choice_cleanup == "delete":
                df = DataFrameUtil.drop_na_row(df)

            # Drop columns / keep only requested columns.
            if exclude_columns:
                df = dataframe_exclude_columns(df, exclude_columns)
            if remain_columns:
                df = dataframe_remain_columns(df, remain_columns)

            # Write header row only when the input declared one.
            header = column_header_idx is not None
            df.to_csv(fs.get_base_location() + save_as_name,
                      index=None, header=header)

            columns_value = df.columns.tolist()
            file_json_data = df.to_json(orient='values')
            analyze_results = DataFrameUtil.analyze_dataframe(df)
            resp_data = {msg.SUCCESS: 'The file has been save as ' + save_as_name,
                         'table_data': file_json_data,
                         'table_columns': columns_value,
                         'analysis': analyze_results}
        else:
            resp_data = {msg.ERROR: '[ERROR] Invalid parameter.'}
    else:
        # BUG FIX: previously an invalid form fell through with resp_data
        # unbound, raising NameError at the return below.
        resp_data = {msg.ERROR: escape(form._errors)}

    return JsonResponse(resp_data)
def pca_plot(request):
    """Reduce uploaded data to 3 PCA components and return the coordinates
    as JSON under ``resp_data['plot']`` (keys ``x``/``y``/``z``).
    """
    form = PcaPlotForm(request.POST, request.FILES)
    resp_data = dict()

    # PCA 3D coordinates
    plot = dict()
    if form.is_valid():
        data_file = form.cleaned_data["data_file"]
        df_input = DataFrameUtil.file_to_dataframe(data_file, header=None)

        X, pca = PcaUtil.reduce_dimension(df_input, n_components=3)

        # One list of coordinates per axis.
        for col, axis in enumerate(('x', 'y', 'z')):
            plot[axis] = list(X[:, col])
        resp_data['plot'] = plot
    else:
        resp_data[msg.ERROR] = escape(form._errors)
    return JsonResponse(resp_data)
def process_clean_up_data_handler(request):
    """Clean up uploaded data (row slicing, NaN-row removal, column drops)
    and return the result as JSON without saving it.

    Response contains ``table_data``, ``table_columns`` and ``analysis``
    on success, or an error message on bad input / invalid form.
    """
    form = ProcessFileForm(request.POST, request.FILES)
    if form.is_valid():
        file_name = request.FILES["data_file"]
        choice_cleanup = form.cleaned_data["choice_cleanup"]
        column_header = form.cleaned_data["column_header"]
        exclude_columns = form.cleaned_data["exclude_columns"]
        remain_columns = form.cleaned_data["remain_columns"]
        split_row_from = form.cleaned_data["split_row_from"]
        split_row_to = form.cleaned_data["split_row_to"]

        if file_name:
            # When the column-header box is checked, row 0 is the header.
            column_header_idx = 0 if column_header == "on" else None
            df = DataFrameUtil.file_to_dataframe(file_name,
                                                 header=column_header_idx)

            # BUG FIX: condition previously tested split_row_from twice
            # instead of split_row_from AND split_row_to.
            if split_row_from and split_row_to:
                # Convert 1-based "from" to a zero-based slice start.
                df = df.iloc[split_row_from - 1:split_row_to, :]

            # TODO fill NaN with mean / median as alternative strategies.
            if choice_cleanup == "delete":
                df = DataFrameUtil.drop_na_row(df)

            # Drop columns / keep only requested columns.
            if exclude_columns:
                df = dataframe_exclude_columns(df, exclude_columns)
            if remain_columns:
                df = dataframe_remain_columns(df, remain_columns)

            resp_data = {
                'table_data': df.to_json(orient='values'),
                'table_columns': df.columns.tolist(),
                'analysis': DataFrameUtil.analyze_dataframe(df)}
        else:
            resp_data = {msg.ERROR: '[ERROR] Invalid request parameters.'}
    else:
        # Form validation error
        resp_data = {msg.ERROR: escape(form._errors)}

    # BUG FIX: original had two consecutive returns (one unreachable);
    # a single exit point is kept.
    return JsonResponse(resp_data)
def process_data_handler(request):
    """ Process uploaded data to find 3 features that most relevance to clinical outcomes
    Result returned in JSON format as following:
    - plot: {data: {x: .., y:.., z: ..., label: ..., column_names: []}}
    - msg_info|msg_error|msg_success|msg_warning| : ....
      data_tables: {table1: { table_columns: [..,..] , table_data: [[..]], point_id: [...]},
                    table2: {...}}
    """
    form = DataFileInputForm(request.POST, request.FILES)
    resp_data = dict()

    # 3D most importance features
    plot = dict()
    # Plot Feature ranking
    plot_feature_ranking = dict()
    data_tables = dict()

    if form.is_valid():
        # Get input files and header options.
        data_file = form.cleaned_data["data_file"]
        output_file = form.cleaned_data["output_file"]
        data_column_header = form.cleaned_data['data_column_header']
        output_column_header = form.cleaned_data['output_column_header']

        # Placeholders for the uploaded data.
        df_data = pd.DataFrame()
        df_output = pd.DataFrame()

        # Check if both required input files are valid.
        if data_file and output_file:
            # Convert radiomic data to dataframe; "on" means row 0 is header.
            data_column_header_idx = 0 if data_column_header == "on" else None
            df_data = DataFrameUtil.file_to_dataframe(
                data_file, header=data_column_header_idx)
            if data_column_header_idx is None:
                # No header in file: generate string names "0".."n-1".
                df_data.columns = np.arange(0, df_data.shape[1]).astype(str)

            # Convert clinical outcomes data to dataframe.
            output_column_header_idx = 0 if output_column_header == "on" else None
            df_output = DataFrameUtil.file_to_dataframe(
                output_file, header=output_column_header_idx)
            # BUG FIX: column names were previously generated BEFORE the
            # file was read (on the empty placeholder frame), so the
            # generated names were immediately overwritten and never applied.
            if output_column_header_idx is None:
                df_output.columns = np.arange(0, df_output.shape[1]).astype(str)

            # Apply feature selection model to select the 2 or 3 features
            # most relevant to the clinical outcomes.
            X_selected, arr_sorted_columns, arr_sorted_importance, arr_cate_columns = \
                feature_selection_random_forest_regressor(df_data, df_output)

            # Generate unique id for each row (required by SlickGrid).
            # TODO change unique_ids to patient ID or etc (confirm with Carlos)
            unique_ids = np.arange(0, df_data.shape[0])

            # 3 selected features -> 3D plot, otherwise 2D.
            if df_data.shape[1] > 2:
                space_col_names = ['x', 'y', 'z']
            else:
                space_col_names = ['x', 'y']

            plot_data = pd.DataFrame(data=X_selected.values,
                                     columns=space_col_names)
            plot_data['label'] = unique_ids
            plot['column_names'] = list(X_selected.columns.values)

            # Feature ranking
            plot_feature_ranking['column_names'] = arr_sorted_columns
            plot_feature_ranking['importances'] = arr_sorted_importance

            # Data table
            plot["data"] = plot_data.to_json()

            # Add column 'id' for slickgrid
            df_data.insert(loc=0, column='id', value=unique_ids)
            data_tables['table1'] = {
                'table_data': df_data.to_json(orient='records'),
                'column_names': list(df_data.columns.values),
                'point_id': str(unique_ids)}

            # Original outcomes column names are used for generating group of
            # colorscale button in UI part.
            df_output.insert(loc=0, column='id', value=unique_ids)
            data_tables['table2'] = {
                'table_data': df_output.to_json(orient='records'),
                'column_names': list(df_output.columns.values),
                'point_id': str(unique_ids),  # not used in frontend
                'cate_columns': arr_cate_columns}

            # Prepare response data
            resp_data['plot'] = plot
            resp_data['plot_feature_ranking'] = plot_feature_ranking
            resp_data['data_tables'] = data_tables
    else:
        resp_data[msg.ERROR] = escape(form._errors)

    return JsonResponse(resp_data)
def process_data_handler(request):
    """ Get data for analysis and general information
    Result format
    plot: {original_data: {x: .., y:.., label: ...},
           new_data: {x:..., y:..., label:...}}
    data_table: {table_columns: ..., table_data: ...}
    msg_info|msg_error|msg_success|msg_warning| : ....
    data_tables: {table1: { table_columns: [..,..] , table_data: [[..]], point_id: [...]},
                  table2: {...}}

    NOTE(review): another view in this project is also named
    process_data_handler; kept as-is since URL configs reference it by module.
    """
    form = VisInputForm(request.POST, request.FILES)
    resp_data = dict()
    plot = dict()
    data_tables = dict()

    if form.is_valid():
        data_file = form.cleaned_data["data_file"]
        label_file = form.cleaned_data["label_file"]
        add_data_file = form.cleaned_data["add_data_file"]
        predict_data_file = form.cleaned_data["new_data_file"]
        general_data_file = form.cleaned_data["general_data_file"]

        data_column_header = form.cleaned_data['data_column_header']
        add_data_column_header = form.cleaned_data['add_data_column_header']
        label_column_header = form.cleaned_data['label_column_header']
        new_data_column_header = form.cleaned_data['new_data_column_header']
        general_data_column_header = form.cleaned_data[
            'general_data_column_header']

        df_data = pd.DataFrame()          # Original data space
        df_label = pd.DataFrame()         # Label of original data
        df_add_data = pd.DataFrame()      # Additional data for base space
        df_new_data = pd.DataFrame()      # New data to predict
        df_general_info = pd.DataFrame()  # General info

        # Check if data contain table header or not, then read the file
        # with/without a header row to generate the dataframe.
        data_column_header_idx = None
        if data_file:
            if data_column_header == "on":
                data_column_header_idx = 0
            df_data = DataFrameUtil.file_to_dataframe(
                data_file, header=data_column_header_idx)

            # Reduce dimension for visualization (2D base space).
            X_scaled = PreProcessingUtil.fit_transform(df_data)
            X_ori2d, pca = PcaUtil.reduce_dimension(X_scaled, n_components=2)
            df_plot_original = pd.DataFrame(data=X_ori2d, columns=['x', 'y'])

        if label_file:
            label_column_header_idx = None
            if label_column_header == "on":
                label_column_header_idx = 0
            df_label = DataFrameUtil.file_to_dataframe(
                label_file, header=label_column_header_idx)

        # Process additional data for the data table.
        df_add_data_id = pd.DataFrame()  # Unique ID to attach to data points
        if add_data_file:
            add_data_column_header_idx = None
            if add_data_column_header == "on":
                add_data_column_header_idx = 0
            df_add_data = DataFrameUtil.file_to_dataframe(
                add_data_file, header=add_data_column_header_idx)
            # First column of the additional data is the point ID.
            df_add_data_id = df_add_data.iloc[:, 0]

        # Join base space X, y ==> label, x coordinate, y coordinate
        df_plot_original['label'] = df_label

        # Optional: add unique key to each data point
        # (format becomes point_id, label, x, y).
        if not df_add_data_id.empty:
            df_plot_original['point_id'] = df_add_data_id.values

        plot["original_data"] = df_plot_original.to_json()
        # For SlickGrid format
        plot["original_data_split"] = df_plot_original.to_json(
            orient='columns')
        # ========== End of processing original data for data point ======

        # If new data file is uploaded, predict the data and add to plot.
        df_plot_predict = pd.DataFrame()
        if predict_data_file:
            new_column_header_idx = None
            # BUG FIX: this previously tested label_column_header and
            # assigned label_column_header_idx, so the new-data header
            # option was ignored and the file was always read header-less.
            if new_data_column_header == "on":
                new_column_header_idx = 0
            df_new_data = DataFrameUtil.file_to_dataframe(
                predict_data_file, header=new_column_header_idx)

            # Process data with pipeline of selected algorithm.
            X_new_scaled, y_predict = predict_new_data(df_new_data)
            X_new2d, new_pca = PcaUtil.reduce_dimension(X_new_scaled,
                                                        n_components=2)
            df_plot_predict = pd.DataFrame(data=X_new2d, columns=['x', 'y'])
            df_plot_predict['label'] = y_predict
            plot['new_data'] = df_plot_predict.to_json()

        # If additional info for the predicting data is uploaded, update
        # new_data with point_id (format: point_id, label, x, y).
        df_predict_data_info = pd.DataFrame()
        df_predict_data_id = pd.DataFrame()
        if general_data_file:
            general_data_column_header_idx = None
            if general_data_column_header == "on":
                general_data_column_header_idx = 0
            df_predict_data_info = DataFrameUtil.file_to_dataframe(
                general_data_file, header=general_data_column_header_idx)
            # First column of the general-info file is the point ID.
            df_plot_predict['point_id'] = df_predict_data_info.iloc[:, 0].values
            plot['new_data'] = df_plot_predict.to_json()
        # =========== End of Processing Predict Data =========

        if not df_predict_data_info.empty:
            # Append general info of new data to the base space table.
            df_add_data = df_add_data.append(df_predict_data_info)

        # Prepare data for visualization.
        resp_data['plot'] = plot

        # id for slickgrid (required)
        if not df_add_data_id.empty:
            df_data.insert(loc=0, column='id', value=df_add_data_id.values)
        else:
            df_data.insert(loc=0, column='id',
                           value=np.arange(0, df_data.shape[0]))
        data_tables['table1'] = {
            'table_data': df_data.to_json(orient='records'),
            'point_id': str(list(df_data['id'].values))}

        if not df_add_data.empty:
            # For SlickGrid use orient='records'
            # Format point_id: [{..}, {..}]
            df_add_data['id'] = df_add_data.iloc[:, 0].values
            # Slickgrid does not support column with dot like "f.eid"
            df_add_data.rename(columns={'f.eid': 'f:eid'}, inplace=True)
            data_tables['table2'] = {
                'table_data': df_add_data.to_json(orient='records'),
                'point_id': df_add_data.iloc[:, 0].to_json(orient='values')}

            # TypeError: Object of type 'int64' is not JSON serializable,
            # so cast min/max values to str.
            resp_data['height_min'] = str(df_add_data['height'].min())
            resp_data['height_max'] = str(df_add_data['height'].max())
            resp_data['weight_min'] = str(df_add_data['weight'].min())
            resp_data['weight_max'] = str(df_add_data['weight'].max())
            resp_data['age_min'] = str(df_add_data['age'].min())
            resp_data['age_max'] = str(df_add_data['age'].max())

        resp_data['data_tables'] = data_tables
    else:
        resp_data[msg.ERROR] = escape(form._errors)

    return JsonResponse(resp_data)