def store_table_in_db(data_frame, pk, table_name, temporary=False):
    """
    Dump a data frame into a DB table and reconcile the column metadata.

    When ``temporary`` is True the frame is simply stored and its column
    information is returned to the caller. Otherwise the workflow identified
    by ``pk`` is updated: new Column objects are created for frame columns
    the workflow does not know about, column types are refreshed from the
    stored table, and the workflow counters are saved.

    :param data_frame: Data frame to dump to the DB
    :param pk: Primary key of the workflow owning the table
    :param table_name: Name of the DB table to (re)create
    :param temporary: True if the table does not belong to a workflow yet
    :return: When temporary is True, a list with three lists
             [column names, column types, column-is-unique flags];
             otherwise None (the information is stored in the workflow).
    """
    if settings.DEBUG:
        print('Storing table ', table_name)

    # Column names as they appear in the incoming frame
    frame_col_names = list(data_frame.columns)

    if temporary:
        # Simple path: dump the frame and report back its metadata.
        unique_flags = are_unique_columns(data_frame)
        store_table(data_frame, table_name)
        frame_col_types = df_column_types_rename(table_name)
        # Names / types / uniqueness needed by the next upload steps
        return [frame_col_names, frame_col_types, unique_flags]

    # Non-temporary: reconcile with an existing workflow's columns.
    workflow = Workflow.objects.get(id=pk)
    known_names = Column.objects.filter(workflow__id=pk).values_list(
        "name", flat=True)

    # Create Column objects for every frame column the workflow lacks.
    for col_name in frame_col_names:
        if col_name in known_names:
            # Already tracked by the workflow; nothing to do.
            continue
        Column(
            name=col_name,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[col_name].dtype.name],
            is_key=is_unique_column(data_frame[col_name]),
            # Append at the end of the current column list
            position=Column.objects.filter(workflow=workflow).count() + 1,
        ).save()

    # Re-fetch the full set of columns and order the frame to match it.
    wf_columns = Column.objects.filter(workflow__id=pk)
    data_frame = data_frame[[col.name for col in wf_columns]]

    # Persist the (reordered) frame.
    store_table(data_frame, table_name)

    # The DB may coerce some "object" columns (e.g. to boolean); sync the
    # stored type back into each Column object when it drifted.
    for db_type, col in zip(df_column_types_rename(table_name), wf_columns):
        if col.data_type != db_type:
            col.data_type = db_type
            col.save()

    # Refresh the workflow bookkeeping fields.
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None
def run(request, pk):
    """
    View provided as the first step to execute a plugin.

    On GET (or an invalid POST) it renders the column-selection form. On a
    valid POST it loads the workflow data frame, runs the plugin over the
    selected columns, validates the result, merges it back into the
    workflow table, logs the execution and renders the execution report.

    :param request: HTTP request received
    :param pk: primary key of the plugin
    :return: Page offering to select the columns to invoke
    """
    # Get the workflow and the plugin information
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    try:
        plugin_info = PluginRegistry.objects.get(pk=pk)
    except PluginRegistry.DoesNotExist:
        return redirect('workflow:index')

    plugin_instance, msgs = load_plugin(plugin_info.filename)
    if plugin_instance is None:
        messages.error(
            request,
            'Unable to instantiate plugin "{0}"'.format(plugin_info.name))
        return redirect('dataops:transform')

    if len(plugin_instance.input_column_names) > 0:
        # The plug in works with a fixed set of columns; they must all be
        # present in the workflow. NOTE: this used to be a *strict* subset
        # test (<), which wrongly rejected plugins needing exactly every
        # workflow column; <= (issubset) is the correct check.
        cnames = workflow.columns.all().values_list('name', flat=True)
        if not set(plugin_instance.input_column_names) <= set(cnames):
            messages.error(
                request,
                'Workflow does not have the correct columns to run this '
                'plugin'
            )
            return redirect('dataops:transform')

    # Create the form to select the columns and the corresponding dictionary
    form = SelectColumnForm(request.POST or None,
                            workflow=workflow,
                            plugin_instance=plugin_instance)

    # Set the basic elements in the context
    context = {
        'form': form,
        'output_column_fields': [
            x for x in list(form)
            if x.name.startswith(field_prefix + 'output')
        ],
        'parameters': [
            x for x in list(form)
            if x.name.startswith(field_prefix + 'parameter')
        ],
        'pinstance': plugin_instance,
        'id': workflow.id
    }

    def log_and_report(exec_status):
        # Record the failure in the log and render the execution report.
        context['exec_status'] = exec_status
        logs.ops.put(request.user,
                     'plugin_execute',
                     workflow,
                     {'id': plugin_info.id,
                      'name': plugin_info.name,
                      'status': exec_status})
        return render(request,
                      'dataops/plugin_execution_report.html',
                      context)

    # If it is a GET request or non valid, render the form.
    if request.method == 'GET' or not form.is_valid():
        return render(request, 'dataops/plugin_info_for_run.html', context)

    # POST is correct, proceed with execution.
    # Get the data frame and select the appropriate columns
    try:
        dst_df = pandas_db.load_from_db(workflow.id)
    except Exception:
        messages.error(request, 'Exception while retrieving the data frame')
        return render(request, 'error.html', {})

    # Take the list of inputs from the form if empty list is given.
    if not plugin_instance.input_column_names:
        plugin_instance.input_column_names = \
            [c.name for c in form.cleaned_data['columns']]

    # Get the proper subset of the data frame
    sub_df = dst_df[[form.cleaned_data['merge_key']] +
                    plugin_instance.input_column_names]

    # Process the output columns: apply the user-chosen names and suffix.
    for idx, output_cname in enumerate(plugin_instance.output_column_names):
        new_cname = form.cleaned_data[field_prefix + 'output_%s' % idx]
        if form.cleaned_data['out_column_suffix']:
            new_cname += form.cleaned_data['out_column_suffix']
        plugin_instance.output_column_names[idx] = new_cname

    # Pack the parameters (tpl[0] is the parameter name)
    params = dict()
    for idx, tpl in enumerate(plugin_instance.parameters):
        params[tpl[0]] = \
            form.cleaned_data[field_prefix + 'parameter_%s' % idx]

    # Execute the plugin
    result_df, status = run_plugin(plugin_instance,
                                   sub_df,
                                   form.cleaned_data['merge_key'],
                                   params)

    if status is not None:
        return log_and_report(status)

    # Additional checks
    # Result has the same number of rows
    if result_df.shape[0] != dst_df.shape[0]:
        return log_and_report(
            'Incorrect number of rows in result data frame.')

    # Result column names are consistent
    if set(result_df.columns) != \
            set([form.cleaned_data['merge_key']] +
                plugin_instance.output_column_names):
        return log_and_report('Incorrect columns in result data frame.')

    # Proceed with the merge
    try:
        result = ops.perform_dataframe_upload_merge(
            workflow.id,
            dst_df,
            result_df,
            {
                'how_merge': 'left',
                'dst_selected_key': form.cleaned_data['merge_key'],
                'src_selected_key': form.cleaned_data['merge_key'],
                'initial_column_names': list(result_df.columns),
                'rename_column_names': list(result_df.columns),
                'columns_to_upload': [True] * len(list(result_df.columns))
            })
    except Exception as e:
        # BUG FIX: Python 3 exceptions have no .message attribute; the old
        # code raised AttributeError here instead of reporting the error.
        context['exec_status'] = str(e)
        return render(request,
                      'dataops/plugin_execution_report.html',
                      context)

    if isinstance(result, str):
        # Something went wrong during the merge
        context['exec_status'] = result
        return render(request,
                      'dataops/plugin_execution_report.html',
                      context)

    # Get the resulting dataframe
    final_df = pandas_db.load_from_db(workflow.id)

    # Update execution time
    plugin_info.executed = datetime.now(
        pytz.timezone(ontask_settings.TIME_ZONE))
    plugin_info.save()

    # List of pairs (column name, column type) in the result to create the
    # log event. Materialized with list(): a bare zip object is a one-shot
    # iterator and would store nothing useful in the log payload.
    result_columns = list(zip(list(result_df.columns),
                              pandas_db.df_column_types_rename(result_df)))

    # Log the (successful) event
    logs.ops.put(
        request.user,
        'plugin_execute',
        workflow,
        {'id': plugin_info.id,
         'name': plugin_info.name,
         'status': status,
         'result_columns': result_columns})

    # Create the table information to show in the report: one (old, new)
    # pair per column of the final frame.
    column_info = []
    dst_names = list(dst_df.columns)
    result_names = list(result_df.columns)
    for c in list(final_df.columns):
        if c not in result_names:
            # Untouched pre-existing column
            column_info.append((c, ''))
        elif c not in dst_names:
            # Column created by the plugin
            column_info.append(('', c + ' (New)'))
        else:
            if c == form.cleaned_data['merge_key']:
                column_info.append((c, c))
            else:
                column_info.append((c + ' (Update)', c))

    context['info'] = column_info
    context['key'] = form.cleaned_data['merge_key']
    context['id'] = workflow.id

    # Redirect to the notification page with the proper info
    return render(request, 'dataops/plugin_execution_report.html', context)
def store_table_in_db(data_frame, pk, table_name, temporary=False,
                      reset_keys=True):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information.

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating if the table is temporary, or it
           belongs to an existing workflow.
    :param reset_keys: Reset the value of the field is_key computing it
           from scratch
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """
    if settings.DEBUG:
        print('Storing table ', table_name)

    # get column names
    df_column_names = list(data_frame.columns)

    # if the data frame is temporary, the procedure is much simpler
    if temporary:
        # Get the if the columns have unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three list with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF
    # Get the workflow and its columns
    workflow = Workflow.objects.get(id=pk)
    wf_cols = workflow.columns.all()

    # Loop over the columns in the Workflow to refresh the is_key value.
    # There may be values that have been added to the column, so this field
    # needs to be reassessed
    for col in wf_cols:
        if reset_keys:
            new_val = is_unique_column(data_frame[col.name])
            if col.is_key and not new_val:
                # Only set the is_key value if the column states that it is
                # a key column, but the values say no. The other way around
                # (is_key is false in the column) is ignored as it may have
                # been set by the user
                col.is_key = new_val
                col.save()

        # Remove this column name from df_column_names, no further
        # processing is needed.
        # NOTE(review): list.remove raises ValueError if a workflow column
        # is missing from data_frame — presumably callers guarantee the
        # frame contains every workflow column; confirm against callers.
        df_column_names.remove(col.name)

    # Loop over the remaining columns in the data frame and create the new
    # column objects in the workflow
    for cname in df_column_names:
        # Create the new column
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            # Append after the currently existing columns
            position=workflow.columns.count() + 1
        )
        column.save()

    # Get the new set of columns with names
    wf_columns = workflow.columns.all()

    # Reorder the columns in the data frame to match the workflow order
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # If the column type in the DB is different from the one in the
            # object, update
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None