Example #1
def store_temporary_dataframe(
    data_frame: pd.DataFrame,
    workflow,
):
    """Store a temporary dataframe.

    :param data_frame: Data frame to store
    :param workflow: Data frame will belong to this workflow
    :return: List of three lists:
        - Data frame columns
        - Column types (OnTask)
        - List of booleans denoting if the column is unique
    """
    table_name = workflow.get_upload_table_name()

    # Check whether each column contains only unique values
    column_unique = pandas.are_unique_columns(data_frame)

    # Store the table in the DB
    pandas.store_table(data_frame, table_name)

    # Get the column types
    df_column_types = sql.get_df_column_types(table_name)

    # Return a list with three lists with information about the
    # data frame that will be needed in the next steps
    return [list(data_frame.columns), df_column_types, column_unique]
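The helper pandas.are_unique_columns is not shown in this example. A minimal sketch of what it plausibly computes, assuming it returns one boolean per column; whether nulls count as duplicates is a guess here:

import pandas as pd

def are_unique_columns(data_frame: pd.DataFrame) -> list:
    # One boolean per column: True when no non-null value appears
    # more than once (the dropna is an assumption, not confirmed).
    return [data_frame[col].dropna().is_unique for col in data_frame.columns]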
Example #2
    def upload_column_name_too_long(self):
        """Use the table store to detect column names that are too long."""
        data_frame = load_df_from_csvfile(io.StringIO(self.csv), 0, 0)

        self.assertTrue(
            any(len(cname) > COLUMN_NAME_SIZE for cname in data_frame.columns))

        try:
            store_table(data_frame, 'TABLE_NAME')
        except Exception as exc:
            self.assertIn('Column name is longer than', str(exc))
        else:
            self.fail('Column with long name is not detected')
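The try/except/else pattern above can be written more compactly with unittest's assertRaises context manager; a sketch of the equivalent check:

with self.assertRaises(Exception) as context:
    store_table(data_frame, 'TABLE_NAME')
self.assertIn('Column name is longer than', str(context.exception))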
Example #3
    def parse_data_frames(self):
        """Parse the two CSV strings and return them as data frames."""

        if self.workflow:
            # Get the workflow data frame
            df_dst = load_table(self.workflow.get_data_frame_table_name())
        else:
            df_dst = load_df_from_csvfile(io.StringIO(self.csv1), 0, 0)

        df_src = load_df_from_csvfile(io.StringIO(self.csv2), 0, 0)
        # Round-trip the source frame through the DB so its column types
        # match those produced by load_table
        store_table(df_src, 'TEMPORARY_TABLE')
        df_src = load_table('TEMPORARY_TABLE')

        # Fix the merge_info fields.
        self.merge_info['initial_column_names'] = list(df_src.columns)
        self.merge_info['rename_column_names'] = list(df_src.columns)
        self.merge_info['columns_to_upload'] = list(df_src.columns)

        return df_dst, df_src
Example #4
    def test_df_equivalent_after_sql(self):

        # Parse the CSV
        df_source = services.load_df_from_csvfile(io.StringIO(self.csv1), 0, 0)

        # Store the DF in the DB
        pandas.store_table(df_source, self.table_name)

        # Load it from the DB
        df_dst = pandas.load_table(self.table_name)

        # NaN in boolean columns are now None
        df_source['bool1'] = df_source['bool1'].where(
            pd.notnull(df_source['bool1']), None)
        df_source['bool2'] = df_source['bool2'].where(
            pd.notnull(df_source['bool2']), None)

        # Data frames must be identical
        assert df_source.equals(df_dst)
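The where(pd.notnull(...), None) idiom above replaces NaN with None so the in-memory column matches what the database round trip produces. A standalone illustration:

import pandas as pd
import numpy as np

s = pd.Series([True, np.nan, False], dtype=object)
# Keep values where the mask holds, substitute None elsewhere.
s = s.where(pd.notnull(s), None)
print(list(s))  # [True, None, False]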
Example #5
    def test_df_equivalent_after_sql(self):

        # Parse the CSV
        df_source = load_df_from_csvfile(io.StringIO(self.csv1), 0, 0)

        # Store the DF in the DB
        store_table(df_source, self.table_name)

        # Load it from the DB
        df_dst = load_table(self.table_name)

        # NaN in boolean columns are now None
        df_source['bool1'] = df_source['bool1'].where(
            pd.notnull(df_source['bool1']), None)
        df_source['bool2'] = df_source['bool2'].where(
            pd.notnull(df_source['bool2']), None)

        # Datetimes need to be converted to the local timezone
        df_dst['date1'] = df_dst['date1'].dt.tz_convert(settings.TIME_ZONE)
        df_dst['date2'] = df_dst['date2'].dt.tz_convert(settings.TIME_ZONE)

        # Data frames must be identical
        assert df_source.equals(df_dst)
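The tz_convert step assumes the datetime columns come back from the database as timezone-aware UTC values and moves them to the configured local timezone (settings.TIME_ZONE). A standalone illustration, with 'Australia/Sydney' as a hypothetical stand-in for that setting:

import pandas as pd

df = pd.DataFrame({'date1': pd.to_datetime(['2018-01-01T00:00:00+00:00'])})
# Convert the tz-aware UTC column to the local timezone.
df['date1'] = df['date1'].dt.tz_convert('Australia/Sydney')
print(df['date1'].dt.tz)  # Australia/Sydney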
Example #6
    def test_eval_sql(self):

        # Create the dataframe with the variables
        df = pd.DataFrame(
            [(1, 2.0, True, 'xxx', datetime.datetime(2018, 1, 1, 0, 0, 0)),
             (None, None, None, None, None)],
            columns=self.test_columns)

        # Store the data frame
        store_table(df, 'TEST_TABLE')

        #
        # EQUAL
        #
        self.do_sql_txt_operand('number', '{0}equal', 'integer', '1')
        self.do_sql_txt_operand('number', '{0}equal', 'double', '2.0')
        self.do_sql_txt_operand('select', '{0}equal', 'boolean', 'true')
        self.do_sql_txt_operand('text', '{0}equal', 'string', 'xxx')
        self.do_sql_txt_operand('text', '{0}equal', 'datetime',
                                '2018-01-01T00:00:00')

        #
        # BEGINS WITH
        #
        self.do_sql_txt_operand('text', '{0}begins_with', 'string', 'x')

        #
        # CONTAINS
        #
        self.do_sql_txt_operand('text', '{0}contains', 'string', 'xx')

        #
        # ENDS WITH
        #
        self.do_sql_txt_operand('text', '{0}ends_with', 'string', 'xx')

        #
        # IS EMPTY
        #
        self.do_sql_txt_operand('text', 'is_{0}empty', 'string', None)

        #
        # IS NULL
        #
        self.do_sql_txt_operand('number', 'is_{0}null', 'integer', None)
        self.do_sql_txt_operand('number', 'is_{0}null', 'double', None)
        self.do_sql_txt_operand('select', 'is_{0}null', 'boolean', None)
        self.do_sql_txt_operand('text', 'is_{0}null', 'string', None)
        self.do_sql_txt_operand('text', 'is_{0}null', 'datetime', None)

        #
        # LESS
        #
        self.do_sql_txt_operand('number', 'less', 'integer', '2')
        self.do_sql_txt_operand('number', 'less', 'double', '3.2')
        self.do_sql_txt_operand('text', 'less', 'datetime',
                                '2018-01-02T00:00:00')

        #
        # LESS OR EQUAL
        #
        self.do_sql_txt_operand('number', 'less_or_equal', 'integer', '1')
        self.do_sql_txt_operand('number', 'less_or_equal', 'double', '2.0')
        self.do_sql_txt_operand('text', 'less_or_equal', 'datetime',
                                '2018-01-01T00:00:00')

        #
        # GREATER
        #
        self.do_sql_txt_operand('number', 'greater', 'integer', '0')
        self.do_sql_txt_operand('number', 'greater', 'double', '1.2')
        self.do_sql_txt_operand('text', 'greater', 'datetime',
                                '2017-01-01T00:00:00')

        #
        # GREATER OR EQUAL
        #
        self.do_sql_txt_operand('number', 'greater_or_equal', 'integer', '1')
        self.do_sql_txt_operand('number', 'greater_or_equal', 'double', '2.0')
        self.do_sql_txt_operand('text', 'greater_or_equal', 'datetime',
                                '2018-01-01T00:00:00')

        #
        # BETWEEN
        #
        self.do_sql_txt_operand('number', '{0}between', 'integer', ['0', '2'],
                                1, 0)
        self.do_sql_txt_operand('number', '{0}between', 'double',
                                ['1.2', '2.2'], 1, 0)
        self.do_sql_txt_operand('text', '{0}between', 'datetime',
                                ['2017-01-01T00:00:00', '2018-09-13T00:00:00'],
                                1, 0)
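The {0} placeholder in the operand names suggests that do_sql_txt_operand formats each name twice, once with an empty prefix and once with a negation prefix, so every call exercises both the operator and its negation. This reading is an inference from the snippet, not confirmed by it:

template = '{0}equal'
positive = template.format('')       # 'equal'
negated = template.format('not_')    # 'not_equal'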
Example #7
    def create(self, validated_data, **kwargs):
        """Create the new workflow."""
        wflow_name = self.context.get('name')
        if not wflow_name:
            wflow_name = self.validated_data.get('name')
            if not wflow_name:
                raise Exception(_('Unexpected empty workflow name.'))

            if models.Workflow.objects.filter(
                    name=wflow_name, user=self.context['user']).exists():
                raise Exception(
                    _('There is a workflow with this name. ' +
                      'Please provide a workflow name in the import page.'))

        # Initial values
        workflow_obj = None
        try:
            workflow_obj = models.Workflow(
                user=self.context['user'],
                name=wflow_name,
                description_text=validated_data['description_text'],
                nrows=0,
                ncols=0,
                attributes=validated_data['attributes'],
                query_builder_ops=validated_data.get('query_builder_ops', {}),
            )
            workflow_obj.save()

            # Create the columns
            column_data = ColumnSerializer(
                data=validated_data.get('columns', []),
                many=True,
                context={'workflow': workflow_obj})
            # And save its content
            if column_data.is_valid():
                columns = column_data.save()
            else:
                raise Exception(_('Unable to save column information'))

            # If there is any column with position = 0, recompute positions
            # (this guarantees backward compatibility).
            if any(col.position == 0 for col in columns):
                for idx, col in enumerate(columns):
                    col.position = idx + 1
                    col.save()

            # Load the data frame
            data_frame = validated_data.get('data_frame')
            if data_frame is not None:
                # Store the table in the DB
                pandas.store_table(
                    data_frame,
                    workflow_obj.get_data_frame_table_name(),
                    dtype={
                        col.name: col.data_type
                        for col in workflow_obj.columns.all()
                    },
                )

                # Reconcile the workflow row/column counts with the data
                # frame just loaded
                workflow_obj.ncols = validated_data['ncols']
                workflow_obj.nrows = validated_data['nrows']

                workflow_obj.save()

            # Create the actions pointing to the workflow
            action_data = ActionSerializer(
                data=validated_data.get('actions', []),
                many=True,
                context={'workflow': workflow_obj, 'columns': columns})

            if action_data.is_valid():
                action_data.save()
            else:
                raise Exception(_('Unable to save action information'))

            # Create the views pointing to the workflow
            view_data = ViewSerializer(
                data=validated_data.get('views', []),
                many=True,
                context={'workflow': workflow_obj, 'columns': columns})

            if view_data.is_valid():
                view_data.save()
            else:
                raise Exception(_('Unable to save view information'))
        except Exception:
            # Get rid of the objects created
            if workflow_obj and workflow_obj.id:
                workflow_obj.delete()
            raise

        return workflow_obj
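The except branch above cleans up a partially created workflow by hand. For the ORM writes, Django's transaction.atomic offers the same all-or-nothing behavior, though the table written by pandas.store_table would still need manual cleanup if it goes through a separate connection. A minimal sketch of the idea, with create_minimal_workflow as a hypothetical stand-in:

from django.db import transaction

def create_minimal_workflow(user, name):
    # Any exception raised inside the block rolls back the ORM saves,
    # replacing the manual delete-on-failure cleanup above.
    with transaction.atomic():
        workflow_obj = models.Workflow(
            user=user, name=name, nrows=0, ncols=0,
            attributes={}, query_builder_ops={})
        workflow_obj.save()
        return workflow_obj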