Example #1
0
    def test_df_equivalent_after_sql(self):
        """Round-trip a CSV fixture through the DB; frames must match."""

        # Build the source data frame from the CSV fixture
        frame_in = pandas_db.load_df_from_csvfile(
            StringIO.StringIO(self.csv1), 0, 0)

        # Persist the frame and read it back from the DB
        pandas_db.store_table(frame_in, self.table_name)
        frame_out = pandas_db.load_table(self.table_name)

        # Data frames must be identical after the round trip
        assert frame_in.equals(frame_out)
Example #2
0
    def df_equivalent_after_sql(self):
        """Round-trip a CSV through the DB, comparing column by column."""

        # Build the source data frame from the CSV fixture
        frame_in = pandas_db.load_df_from_csvfile(
            StringIO.StringIO(self.csv1), 0, 0)

        # Persist the frame and read it back from the DB
        pandas_db.store_table(frame_in, self.table_name)
        frame_out = pandas_db.load_table(self.table_name)

        # Compare values column by column as unicode arrays, because a
        # whole-frame equals() would distinguish None from NaN
        for col in frame_in.columns:
            np.testing.assert_array_equal(
                np.array(frame_in[col], dtype=unicode),
                np.array(frame_out[col], dtype=unicode))
Example #3
0
def store_table_in_db(data_frame, pk, table_name, temporary=False):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating if the table is temporary,
           or it belongs to an existing workflow.
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """

    if settings.DEBUG:
        print('Storing table ', table_name)

    # get column names
    df_column_names = list(data_frame.columns)

    # if the data frame is temporary, the procedure is much simpler
    if temporary:
        # Get the if the columns have unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three list with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF

    # Get the workflow and its columns. Materialise the names once: a lazy
    # values_list queryset would re-issue the query on every membership
    # test in the loop below.
    workflow = Workflow.objects.get(id=pk)
    wf_col_names = list(Column.objects.filter(workflow__id=pk).values_list(
        "name", flat=True))

    # Loop over the columns in the data frame and reconcile the column info
    # with the column objects attached to the WF
    for cname in df_column_names:
        # See if this is a new column
        if cname in wf_col_names:
            # If column already exists in wf_col_names, no need to do anything
            continue

        # Create the new column object attached to the workflow
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            position=Column.objects.filter(workflow=workflow).count() + 1,
        )
        column.save()

    # Get now the new set of columns with names
    wf_columns = Column.objects.filter(workflow__id=pk)

    # Reorder the columns in the data frame to match the workflow order
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # If the column type in the DB is different from the one in the
            # object, update
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None
Example #4
0
    def test_eval_sql(self):
        """Exercise every SQL condition operand against a fixture table."""

        # Create the dataframe with the variables. Integer literals are
        # written without leading zeros: `01`/`00` are octal-style tokens
        # that become a SyntaxError under Python 3 (PEP 3127); the values
        # are identical.
        df = pd.DataFrame([
            (1, 2.0, True, 'xxx', datetime.datetime(2018, 1, 1, 0, 0, 0)),
            (None, None, None, None, None)
        ],
                          columns=[
                              'v_integer', 'v_double', 'v_boolean', 'v_string',
                              'v_datetime'
                          ])

        # Store the data frame
        pandas_db.store_table(df, 'TEST_TABLE')

        #
        # EQUAL
        #
        self.do_sql_operand('number', '{0}equal', 'integer', '1')
        self.do_sql_operand('number', '{0}equal', 'double', '2.0')
        self.do_sql_operand('number', '{0}equal', 'boolean', '1')
        self.do_sql_operand('text', '{0}equal', 'string', 'xxx')
        self.do_sql_operand('text', '{0}equal', 'datetime',
                            '2018-01-01T00:00:00')

        #
        # BEGINS WITH
        #
        self.do_sql_operand('text', '{0}begins_with', 'string', 'x')

        #
        # CONTAINS
        #
        self.do_sql_operand('text', '{0}contains', 'string', 'xx')
        #
        # ENDS WITH
        #
        self.do_sql_operand('text', '{0}ends_with', 'string', 'xx')

        #
        # IS EMPTY
        #
        self.do_sql_operand('number', 'is_{0}empty', 'integer', None)
        self.do_sql_operand('number', 'is_{0}empty', 'double', None)
        self.do_sql_operand('text', 'is_{0}empty', 'string', None)
        self.do_sql_operand('text', 'is_{0}empty', 'boolean', None)
        self.do_sql_operand('text', 'is_{0}empty', 'datetime', None)

        #
        # LESS
        #
        self.do_sql_operand('number', 'less', 'integer', '2')
        self.do_sql_operand('number', 'less', 'double', '3.2')
        self.do_sql_operand('text', 'less', 'datetime', '2018-01-02T00:00:00')

        #
        # LESS OR EQUAL
        #
        self.do_sql_operand('number', 'less_or_equal', 'integer', '1')
        self.do_sql_operand('number', 'less_or_equal', 'double', '2.0')
        self.do_sql_operand('text', 'less_or_equal', 'datetime',
                            '2018-01-01T00:00:00')

        #
        # GREATER
        #
        self.do_sql_operand('number', 'greater', 'integer', '0')
        self.do_sql_operand('number', 'greater', 'double', '1.2')
        self.do_sql_operand('text', 'greater', 'datetime',
                            '2017-01-01T00:00:00')

        #
        # GREATER OR EQUAL
        #
        self.do_sql_operand('number', 'greater_or_equal', 'integer', '1')
        self.do_sql_operand('number', 'greater_or_equal', 'double', '2.0')
        self.do_sql_operand('text', 'greater_or_equal', 'datetime',
                            '2018-01-01T00:00:00')

        #
        # BETWEEN
        #
        self.do_sql_operand('number', '{0}between', 'integer', ['0', '2'])
        self.do_sql_operand('number', '{0}between', 'double', ['1.2', '2.2'])
        self.do_sql_operand('text', '{0}between', 'datetime',
                            ['2017-01-01T00:00:00', '2018-09-13T00:00:00'])
Example #5
0
def store_table_in_db(data_frame,
                      pk,
                      table_name,
                      temporary=False,
                      reset_keys=True):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating if the table is temporary,
           or it belongs to an existing workflow.
    :param reset_keys: Reset the value of the field is_key computing it from
           scratch
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """

    if settings.DEBUG:
        print('Storing table ', table_name)

    # get column names
    df_column_names = list(data_frame.columns)

    # if the data frame is temporary, the procedure is much simpler: no
    # workflow/column reconciliation, just dump the table and report back
    if temporary:
        # Get the if the columns have unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three list with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF

    # Get the workflow and its columns
    workflow = Workflow.objects.get(id=pk)
    wf_cols = workflow.columns.all()

    # Loop over the columns in the Workflow to refresh the is_key value. There
    # may be values that have been added to the column, so this field needs to
    # be reassessed
    for col in wf_cols:
        if reset_keys:
            new_val = is_unique_column(data_frame[col.name])
            if col.is_key and not new_val:
                # Only clear the is_key value if the column states that it is
                # a key column but the values say no. The other way around
                # (is_key False in the column object) is ignored, as it may
                # have been set deliberately by the user
                col.is_key = new_val
                col.save()

        # Remove this column name from wf_col_names, no further processing is
        # needed.
        # NOTE(review): list.remove raises ValueError if a workflow column is
        # absent from the data frame — this assumes every workflow column is
        # present in data_frame; confirm callers guarantee that invariant.
        df_column_names.remove(col.name)

    # Loop over the remaining columns in the data frame (those not yet known
    # to the workflow) and create the new column objects in the workflow
    for cname in df_column_names:
        # Create the new column
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            position=workflow.columns.count() + 1
        )
        column.save()

    # Get the new set of columns with names
    wf_columns = workflow.columns.all()

    # Reorder the columns in the data frame to match the workflow's column
    # order before dumping the table
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans.
    # zip pairs types and columns positionally; both come from the same
    # table/workflow so their lengths are expected to match.
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # If the column type in the DB is different from the one in the
            # object, update
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None