def _inner_join(self, table_right: Table, condition): ''' Join table (left) with a supplied table (right) where condition is met. ''' # get columns and operator column_name_left, operator, column_name_right = self._parse_condition( condition, both_columns=True) # try to find both columns, if you fail raise error try: column_index_left = self.column_names.index(column_name_left) column_index_right = table_right.column_names.index( column_name_right) except: raise Exception(f'Columns dont exist in one or both tables.') # get the column names of both tables with the table name in front # ex. for left -> name becomes left_table_name_name etc left_names = [ f'{self._name}_{colname}' for colname in self.column_names ] right_names = [ f'{table_right._name}_{colname}' for colname in table_right.column_names ] # define the new tables name, its column names and types join_table_name = f'{self._name}_join_{table_right._name}' join_table_colnames = left_names + right_names join_table_coltypes = self.column_types + table_right.column_types join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types=join_table_coltypes) # count the number of operations (<,> etc) no_of_ops = 0 # this code is dumb on purpose... it needs to illustrate the underline technique # for each value in left column and right column, if condition, append the corresponding row to the new table for row_left in self.data: left_value = row_left[column_index_left] for row_right in table_right.data: right_value = row_right[column_index_right] no_of_ops += 1 if get_op(operator, left_value, right_value): #EQ_OP join_table._insert(row_left + row_right) print(f'## Select ops no. -> {no_of_ops}') print(f'# Left table size -> {len(self.data)}') print(f'# Right table size -> {len(table_right.data)}') return join_table
def _select_where_with_btree(self, return_columns, bt, condition, order_by=None, asc=False, top_k=None): # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] else: return_cols = [self.column_names.index(colname) for colname in return_columns] column_name, operator, value = self._parse_condition(condition) print("1: ", type(value), " 2: ", self.column_types[self.column_names.index(column_name)]) # if the column in condition is not a primary key, abort the select if column_name != self.column_names[self.pk_idx]: print('Column is not PK. Aborting') # here we run the same select twice, sequentially and using the btree. # we then check the results match and compare performance (number of operation) column = self.columns[self.column_names.index(column_name)] # sequential rows1 = [] opsseq = 0 for ind, x in enumerate(column): opsseq+=1 if get_op(operator, x, value): rows1.append(ind) print(f'Without Btree -> {opsseq} comparison operations') # btree find rows = bt.find(operator, value) print('### Seq result ###') print(rows1) print('### Index result ###') print(rows) # same as simple select from now on rows = rows[:top_k] # TODO: this needs to be dumbed down dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] dict['_no_of_columns'] = len(return_cols) if order_by is None: return Table(load=dict) else: return Table(load=dict).order_by(order_by, asc)
def _update_row(self, set_value, set_column, condition): ''' update where Condition ''' # parse the condition column_name, operator, value = self._parse_condition(condition) # get the condition and the set column column = self.column_by_name(column_name) set_column_idx = self.column_names.index(set_column) # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names] # for each value in column, if condition, replace it with set_value for row_ind, column_value in enumerate(column): if get_op(operator, column_value, value): self.data[row_ind][set_column_idx] = set_value
def _select_where(self, return_columns, condition=None, order_by=None, asc=False, top_k=None): ''' Select and return a table containing specified columns and rows where condition is met ''' # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] elif isinstance(return_columns, str): raise Exception( f'Return columns should be "*" or of type list. (the second parameter is return_columns not condition)') else: return_cols = [self.column_names.index(colname) for colname in return_columns] # if condition is None, return all rows # if not, return the rows with values where condition is met for value if condition is not None: column_name, operator, value = self._parse_condition(condition) column = self.columns[self.column_names.index(column_name)] rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] else: rows = [i for i in range(len(self.columns[0]))] # top k rows rows = rows[:top_k] # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned) dict = {(key): ([[self.data[i][j] for j in return_cols] for i in rows] if key == "data" else value) for key, value in self.__dict__.items()} # we need to set the new column names/types and no of columns, since we might # only return some columns dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] dict['_no_of_columns'] = len(return_cols) # order by the return table if specified if order_by is None: return Table(load=dict) else: return Table(load=dict).order_by(order_by, asc)
def _delete_where(self, condition): ''' Deletes rows where condition is met. Important: delete replaces the rows to be deleted with rows filled with Nones. These rows are then appended to the insert_stack. Args: condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or 'value[<,<=,==,>=,>]column'. Operatores supported: (<,<=,==,>=,>) ''' column_name, operator, value = self._parse_condition(condition) indexes_to_del = [] column = self.column_by_name(column_name) for index, row_value in enumerate(column): if get_op(operator, row_value, value): indexes_to_del.append(index) # we pop from highest to lowest index in order to avoid removing the wrong item # since we dont delete, we dont have to to pop in that order, but since delete is used # to delete from meta tables too, we still implement it. for index in sorted(indexes_to_del, reverse=True): if self._name[:4] != 'meta': # if the table is not a metatable, replace the row with a row of nones self.data[index] = [ None for _ in range(len(self.column_names)) ] else: self.data.pop(index) # self._update() # we have to return the deleted indexes, since they will be appended to the insert_stack return indexes_to_del
def _update_row(self, set_value, set_column, condition): ''' update where Condition ''' # parse the condition column_name, operator, value = self._parse_condition(condition) # get the condition and the set column column = self.columns[self.column_names.index(column_name)] if isinstance(set_column, str): set_column_idx = self.column_names.index(set_column) # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names] # check if user tries to set same values for p_k column if set_column_idx == self.pk_idx and set_value in self.columns[ set_column_idx]: raise ValueError( f'## ERROR -> Column {set_column} is primary key and can not have same values.' ) # for each value in column, if condition, replace it with set_value for row_ind, column_value in enumerate(column): if get_op(operator, column_value, value): self.data[row_ind][set_column_idx] = set_value self._update()
def _outer_join(self, table_right: Table, condition): ''' Join table (left) with a supplied table (right). Show all rows from both tables and match the rows where the condition is met ''' # get columns and operator column_name_left, operator, column_name_right = self._parse_condition(condition, join=True) # try to find both columns, if you fail raise error try: column_index_left = self.column_names.index(column_name_left) column_index_right = table_right.column_names.index(column_name_right) except: raise Exception(f'Columns dont exist in one or both tables.') # get the column names of both tables with the table name in front # ex. for left -> name becomes left_table_name_name etc left_names = [f'{self._name}_{colname}' for colname in self.column_names] right_names = [f'{table_right._name}_{colname}' for colname in table_right.column_names] # define the new tables name, its column names and types join_table_name = f'{self._name}_outer_join_{table_right._name}' join_table_colnames = left_names+right_names join_table_coltypes = self.column_types+table_right.column_types join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types= join_table_coltypes) # count the number of operations (<,> etc) no_of_ops = 0 # this code is dumb on purpose... it needs to illustrate the underline technique # for each value in left column and right column, if condition, append the corresponding row to the new table row_null =[] #outer join works with a combination of the left and right join #ONLY DIFFERENCE when doing the right_join part we dont insert if there is a match. Only when there is not for row_left in self.data: row_null.clear() hasMatch = False left_value = row_left[column_index_left] for row_right in table_right.data: right_value = row_right[column_index_right] no_of_ops+=1 if get_op(operator, left_value, right_value): #EQ_OP join_table._insert(row_left+row_right) hasMatch = True if not hasMatch: for column in table_right.column_types: if column == type(1) or column == type(1.2): row_null.append(0) else: row_null.append(None) join_table._insert(row_left + row_null) for row_right in table_right.data: row_null.clear() hasMatch = False right_value = row_right[column_index_right] for row_left in self.data: left_value = row_left[column_index_left] no_of_ops+=1 if get_op(operator, left_value, right_value): #EQ_OP hasMatch = True if not hasMatch: for column in self.column_types: if column == type(1) or column == type(1.2): row_null.append(0) else: row_null.append(None) join_table._insert(row_null + row_right) print(f'## Select ops no. -> {no_of_ops}') print(f'# Left table size -> {len(self.data)}') print(f'# Right table size -> {len(table_right.data)}') return join_table
def _inner_join(self, table_right: Table, condition): ''' Join table (left) with a supplied table (right) where condition is met. Args: condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or 'value[<,<=,==,>=,>]column'. Operatores supported: (<,<=,==,>=,>) ''' # get columns and operator column_name_left, operator, column_name_right = self._parse_condition( condition, join=True) # try to find both columns, if you fail raise error try: column_index_left = self.column_names.index(column_name_left) except: raise Exception( f'Column "{column_name_left}" dont exist in left table. Valid columns: {self.column_names}.' ) try: column_index_right = table_right.column_names.index( column_name_right) except: raise Exception( f'Column "{column_name_right}" dont exist in right table. Valid columns: {table_right.column_names}.' ) # get the column names of both tables with the table name in front # ex. for left -> name becomes left_table_name_name etc left_names = [ f'{self._name}.{colname}' if self._name != '' else colname for colname in self.column_names ] right_names = [ f'{table_right._name}.{colname}' if table_right._name != '' else colname for colname in table_right.column_names ] # define the new tables name, its column names and types join_table_name = '' join_table_colnames = left_names + right_names join_table_coltypes = self.column_types + table_right.column_types join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types=join_table_coltypes) # count the number of operations (<,> etc) no_of_ops = 0 # this code is dumb on purpose... it needs to illustrate the underline technique # for each value in left column and right column, if condition, append the corresponding row to the new table for row_left in self.data: left_value = row_left[column_index_left] for row_right in table_right.data: right_value = row_right[column_index_right] no_of_ops += 1 if get_op(operator, left_value, right_value): #EQ_OP join_table._insert(row_left + row_right) return join_table
def _select_where(self, return_columns, condition=None, order_by=None, desc=True, top_k=None): ''' Select and return a table containing specified columns and rows where condition is met. Args: return_columns: list. The columns to be returned. condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or 'value[<,<=,==,>=,>]column'. Operatores supported: (<,<=,==,>=,>) order_by: string. A column name that signals that the resulting table should be ordered based on it (no order if None). desc: boolean. If True, order_by will return results in descending order (False by default). top_k: int. An integer that defines the number of rows that will be returned (all rows if None). ''' # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] else: return_cols = [ self.column_names.index(col.strip()) for col in return_columns.split(',') ] # if condition is None, return all rows # if not, return the rows with values where condition is met for value if condition is not None: column_name, operator, value = self._parse_condition(condition) column = self.column_by_name(column_name) rows = [ ind for ind, x in enumerate(column) if get_op(operator, x, value) ] else: rows = [i for i in range(len(self.data))] # top k rows # rows = rows[:int(top_k)] if isinstance(top_k,str) else rows # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned) dict = {(key): ([[self.data[i][j] for j in return_cols] for i in rows] if key == "data" else value) for key, value in self.__dict__.items()} # we need to set the new column names/types and no of columns, since we might # only return some columns dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] s_table = Table(load=dict) if order_by: s_table.order_by(order_by, desc) s_table.data = s_table.data[:int(top_k)] if isinstance( top_k, str) else s_table.data return s_table
def _full_outer_join(self, table_right: Table, condition): # get columns and operator column_name_left, operator, column_name_right = self._parse_condition( condition, join=True) # try to find both columns, if you fail raise error try: column_index_left = self.column_names.index(column_name_left) column_index_right = table_right.column_names.index( column_name_right) except: raise Exception(f'Columns dont exist in one or both tables.') # get the column names of both tables with the table name in front # ex. for left -> name becomes left_table_name_name etc left_names = [ f'{self._name}_{colname}' for colname in self.column_names ] right_names = [ f'{table_right._name}_{colname}' for colname in table_right.column_names ] # define the new tables name, its column names and types join_table_name = f'{self._name}_full_outer_join_{table_right._name}' join_table_colnames = left_names + right_names join_table_coltypes = self.column_types + table_right.column_types join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types=join_table_coltypes) # count the number of operations (<,> etc) no_of_ops = 0 nulls = [] #enter 0 as many as the columns of the right table SameValues = [] #values that we have already enter in the join_table newrow = 0 #counts rows in join_table # for each value in left column and right column, if condition, append the corresponding row to the new table for row_left in self.data: nulls.clear() left_value = row_left[column_index_left] found = False for row_right in table_right.data: right_value = row_right[column_index_right] no_of_ops += 1 if get_op(operator, left_value, right_value): #EQ_OP join_table._insert(row_left + row_right) newrow += 1 # new row in join table found = True SameValues.append( row_right) #keep the values that we have found match if not found: # if not fount match in the right table, then for every column of the right table enter 0 to nulls for columns in range(table_right._no_of_columns): nulls.append(0) join_table._insert( row_left + nulls ) #append left table's values and nulls into join table newrow += 1 # new row in join table for columns in range(table_right._no_of_columns ): # change every 0 column to Null join_table.data[newrow - 1][columns + self._no_of_columns] = 'Null' join_table._update() #for each value in right column, check if it exists in SameValues. If not then add it to the join_table with null in left columns for row_right in table_right.data: right = False #found right column in SameVlaues nulls.clear() for j in range(len(SameValues)): if SameValues[j] == row_right: right = True if not right: for columns in range(self._no_of_columns): nulls.append(0) join_table._insert( nulls + row_right ) #append right table's values and nulls into join table newrow += 1 # new row in join table for columns in range( self._no_of_columns): # change every 0 column to Null join_table.data[newrow - 1][columns] = 'Null' join_table._update() print(f'## Select ops no. -> {no_of_ops}') print(f'# Left table size -> {len(self.data)}') print(f'# Right table size -> {len(table_right.data)}') return join_table
def _update_row(self, set_value, set_column, condition): ''' update where Condition ''' # parse the condition column_name, operator, value = self._parse_condition(condition) # get the condition and the set column column = self.columns[self.column_names.index(column_name)] set_column_idx = self.column_names.index(set_column) # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names] # for each value in column, if condition, replace it with set_value for row_ind, column_value in enumerate(column): if get_op(operator, column_value, value): self.data[row_ind][set_column_idx] = set_value self._update() def _delete_where(self, condition): ''' Deletes rows where condition is met. Important: delete replaces the rows to be deleted with rows filled with Nones, These rows are then appened to the insert_stack ''' column_name, operator, value = self._parse_condition(condition) indexes_to_del = [] column = self.columns[self.column_names.index(column_name)] for index, row_value in enumerate(column): if get_op(operator, row_value, value): indexes_to_del.append(index) # we pop from highest to lowest index in order to avoid removing the wrong item # since we dont delete, we dont have to to pop in that order, but since delete is used # to delete from meta tables too, we still implement it. for index in sorted(indexes_to_del, reverse=True): if self._name[:4] != 'meta': # if the table is not a metatable, replace the row with a row of nones self.data[index] = [None for _ in range(len(self.column_names))] else: self.data.pop(index) self._update() print(f"Deleted {len(indexes_to_del)} rows") # we have to return the deleted indexes, since they will be appended to the insert_stack return indexes_to_del def _select_where(self, return_columns, condition=None, order_by=None, asc=False, top_k=None): ''' Select and return a table containing specified columns and rows where condition is met ''' # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] elif isinstance(return_columns, str): raise Exception(f'Return columns should be "*" or of type list. (the second parameter is return_columns not condition)') else: return_cols = [self.column_names.index(colname) for colname in return_columns] # if condition is None, return all rows # if not, return the rows with values where condition is met for value if condition is not None: column_name, operator, value = self._parse_condition(condition) column = self.columns[self.column_names.index(column_name)] rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] else: rows = [i for i in range(len(self.columns[0]))] # top k rows rows = rows[:top_k] # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned) dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} # we need to set the new column names/types and no of columns, since we might # only return some columns dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] dict['_no_of_columns'] = len(return_cols) # order by the return table if specified if order_by is None: return Table(load=dict) else: return Table(load=dict).order_by(order_by, asc) def _select_where_with_btree(self, return_columns, bt, condition, order_by=None, asc=False, top_k=None): # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] else: return_cols = [self.column_names.index(colname) for colname in return_columns] column_name, operator, value = self._parse_condition(condition) print("1: ", type(value), " 2: ", self.column_types[self.column_names.index(column_name)]) # if the column in condition is not a primary key, abort the select if column_name != self.column_names[self.pk_idx]: print('Column is not PK. Aborting') # here we run the same select twice, sequentially and using the btree. # we then check the results match and compare performance (number of operation) column = self.columns[self.column_names.index(column_name)] # sequential rows1 = [] opsseq = 0 for ind, x in enumerate(column): opsseq+=1 if get_op(operator, x, value): rows1.append(ind) print(f'Without Btree -> {opsseq} comparison operations') # btree find rows = bt.find(operator, value) print('### Seq result ###') print(rows1) print('### Index result ###') print(rows) # same as simple select from now on rows = rows[:top_k] # TODO: this needs to be dumbed down dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] dict['_no_of_columns'] = len(return_cols) if order_by is None: return Table(load=dict) else: return Table(load=dict).order_by(order_by, asc) def order_by(self, column_name, asc=False): ''' Order by based on column ''' # get column, sort values and return sorted indexes column = self.columns[self.column_names.index(column_name)] idx = sorted(range(len(column)), key=lambda k: column[k], reverse=not asc) # return table but arange data using idx list (sorted indexes) dict = {(key):([self.data[i] for i in idx] if key=="data" else value) for key, value in self.__dict__.items()} return Table(load=dict) def _sort(self, column_name, asc=False): ''' Same as order by, but its persistant ''' column = self.columns[self.column_names.index(column_name)] idx = sorted(range(len(column)), key=lambda k: column[k], reverse=not asc) # print(idx) self.data = [self.data[i] for i in idx] self._update() def _inner_join(self, table_right: Table, condition): ''' Join table (left) with a supplied table (right) where condition is met. ''' # get columns and operator column_name_left, operator, column_name_right = self._parse_condition(condition, both_columns=True) # try to find both columns, if you fail raise error try: column_index_left = self.column_names.index(column_name_left) column_index_right = table_right.column_names.index(column_name_right) except: raise Exception(f'Columns dont exist in one or both tables.') # get the column names of both tables with the table name in front # ex. for left -> name becomes left_table_name_name etc left_names = [f'{self._name}_{colname}' for colname in self.column_names] right_names = [f'{table_right._name}_{colname}' for colname in table_right.column_names] # define the new tables name, its column names and types join_table_name = f'{self._name}_join_{table_right._name}' join_table_colnames = left_names+right_names join_table_coltypes = self.column_types+table_right.column_types join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types= join_table_coltypes) # count the number of operations (<,> etc) no_of_ops = 0 # this code is dumb on purpose... it needs to illustrate the underline technique # for each value in left column and right column, if condition, append the corresponding row to the new table for row_left in self.data: left_value = row_left[column_index_left] for row_right in table_right.data: right_value = row_right[column_index_right] no_of_ops+=1 if get_op(operator, left_value, right_value): #EQ_OP join_table._insert(row_left+row_right) print(f'## Select ops no. -> {no_of_ops}') print(f'# Left table size -> {len(self.data)}') print(f'# Right table size -> {len(table_right.data)}') return join_table def show(self, no_of_rows=None, is_locked=False): ''' Pretty print the table ''' # if the table is locked, add locked keyword to title if is_locked: print(f"\n## {self._name} (locked) ##") else: print(f"\n## {self._name} ##") # headers -> "column name (column type)" headers = [f'{col} ({tp.__name__})' for col, tp in zip(self.column_names, self.column_types)] if self.pk_idx is not None: # table has a primary key, add PK next to the appropriate column headers[self.pk_idx] = headers[self.pk_idx]+' #PK#' # detect the rows that are no tfull of nones (these rows have been deleted) # if we dont skip these rows, the returning table has empty rows at the deleted positions non_none_rows = [row for row in self.data if any(row)] # print using tabulate print(tabulate(non_none_rows[:no_of_rows], headers=headers)+'\n') def _parse_condition(self, condition, join=False): ''' Parse the single string condition and return column/s value and operator ''' # if both_columns (used by the join function) return the names of the names of the columns (left first) if join: return split_condition(condition) # cast the value with the specified column's type and return the column name, the operator and the casted value left, op, right = split_condition(condition) if left not in self.column_names: raise ValueError(f'Condition is not valid (cant find column name)') coltype = self.column_types[self.column_names.index(left)] return left, op, coltype(right) def _load_from_file(self, filename): ''' Load table from a pkl file (not used currently) ''' f = open(filename, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict)
def _select_where_O(tb_object, return_columns, condition=None, order_by=None, asc=False, top_k=None): #-->Select all columns if return_columns == '*': return_cols = [i for i in range(len(tb_object.column_names))] #-->Invalid parameter input elif isinstance(return_columns, str): raise Exception( f'Return columns should be "*" or of type list. (the second parameter is return_columns not condition)') #-->Select only specified columns else: return_cols = [tb_object.column_names.index(colname) for colname in return_columns] #-->Return records based on condition select_data = [] if condition is not None: column_name, operator, value = tb_object._parse_condition(condition) #-->Condition's culumn index column = tb_object.column_names.index(column_name) #-->Check if column_name is the primary key column if column_name == tb_object.column_names[tb_object.pk_idx]: #-->PK if operator == '==': #-->PK-identity #-->Check if record exists in main file temp = [row[tb_object.pk_idx] for row in tb_object.sequential_file_data] position = binary_search._binary_search(temp, value) #-->Record exists in main file if position != -1: select_data.append(tb_object.sequential_file_data[position]) # -->Search in insert stack else: if value in [row[tb_object.pk_idx] for row in tb_object.data]: for record in tb_object.data: if get_op('==', record[tb_object.pk_idx], value): select_data.append(record) else: #-->PK-range temp = [row[1] for row in tb_object.order] position, exists = binary_search._binary_search_v2(temp, value) #-->Get the starting position (lower limit) if operator == "<" or operator == "<=": lower_limit = 0 elif operator == ">" and exists == True: lower_limit = position + 1 else: lower_limit = position #-->Get corresponding records from both main file and insert stack i = lower_limit while i < len(temp) and get_op(operator, temp[i], value): #-->Record exists in main file if temp[i] in tb_object.sequential_file_columns[column]: select_data.append(tb_object.sequential_file_data[tb_object.sequential_file_columns[column].index(temp[i])]) #-->Record exists in insert stack elif temp[i] in tb_object.columns[column]: select_data.append(tb_object.data[tb_object.columns[column].index(temp[i])]) i += 1 else: #-->Not PK #-->Get corresponding records from both main file and insert stack for elem in tb_object.sequential_file_data: if get_op(operator, elem[column], value): select_data.append(elem) for elem in tb_object.data: if get_op(operator, elem[column], value): select_data.append(elem) #-->Return all records else: #-->Get all records from both main file and insert stack for elem in tb_object.sequential_file_data: select_data.append(elem) for elem in tb_object.data: select_data.append(elem) select_columns = [[row[i] for row in select_data] for i in range(tb_object._no_of_columns)] #-->Selected records indexes rows = [i for i in range(len(select_columns[0]))] #-->Select only top k records rows = rows[:top_k] #-->Store result table's information in a dictionary dict = {(key): ([[select_data[i][j] for j in return_cols] for i in rows] if key == "data" else value) for key, value in tb_object.__dict__.items()} dict['column_names'] = [tb_object.column_names[i] for i in return_cols] dict['column_types'] = [tb_object.column_types[i] for i in return_cols] dict['_no_of_columns'] = len(return_cols) #-->Return result table if order_by is None: return Table(load=dict) #-->Return result table ordered else: return Table(load=dict).order_by(order_by, asc)
def _select_where_groupby(self, return_columns, aggr_function, where_condition=None, having_condition=None, order_by=None, asc=False, top_k=None): ''' Select and return a table containing specified columns and rows grouped-by (a specific column) where condition is met ''' if (isinstance(return_columns, str)): raise Exception(f'Return columns must be inserted as a list') indx = self.column_names.index(return_columns[-1]) typeList = self.column_types # if attemps to perform aggregate function (other than count) to non numeric values, raise exception if (not (typeList[indx] == int or typeList[indx] == float) and not (aggr_function == 'count')): raise Exception(f'Can not perform ' + aggr_function + ' to non numeric values') if (return_columns == '*'): raise Exception( f'Return columns should be explicitly written and the aggregate function will affect the last column' ) elif isinstance(return_columns, str): raise Exception( f'Return columns should be of type list. (the second parameter is return_columns not condition)' ) else: return_cols = [ self.column_names.index(colname) for colname in return_columns ] # if where_condition is None, return all rows # if not, return the rows with values where condition is met for value if where_condition is not None: column_name, operator, value = self._parse_condition( where_condition) column = self.columns[self.column_names.index(column_name)] rows = [ ind for ind, x in enumerate(column) if get_op(operator, x, value) ] else: rows = [i for i in range(len(self.columns[0]))] # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want to be returned) dict = {(key): ([[self.data[i][j] for j in return_cols] for i in rows] if key == "data" else value) for key, value in self.__dict__.items()} dataList = dict.get('data') dictData = {} tempList = [] if (aggr_function == 'sum'): for row in dataList: key = tuple(row[:-1]) if (dictData.get(key, 'None') == 'None'): dictData[key] = row[-1] else: temp = dictData.get(key) temp += row[-1] dictData[key] = temp for k, v in dictData.items(): tempRow = list(k) tempRow.append(v) tempList.append(tempRow) dict['data'] = tempList elif (aggr_function == 'avg'): for row in dataList: key = tuple(row[:-1]) if (dictData.get(key, 'None') == 'None'): dictData[key] = [row[-1], 1] else: tempRow = dictData.get(key) tempRow[0] = tempRow[0] + row[-1] tempRow[1] = tempRow[1] + 1 dictData[key] = tempRow for k, v in dictData.items(): tempRow = list(k) val = v[0] / v[1] tempRow.append(val) tempList.append(tempRow) dict['data'] = tempList elif (aggr_function == 'count'): for row in dataList: key = tuple(row[:-1]) if (dictData.get(key, 'None') == 'None'): dictData[key] = 1 else: tempVal = dictData.get(key) tempVal += 1 dictData[key] = tempVal for k, v in dictData.items(): tempRow = list(k) tempRow.append(v) tempList.append(tempRow) dict['data'] = tempList elif (aggr_function == 'max'): for row in dataList: key = tuple(row[:-1]) if (dictData.get(key, 'None') == 'None'): dictData[key] = row[-1] else: tempVal = dictData.get(key) if (row[-1] > tempVal): tempVal = row[-1] dictData[key] = tempVal for k, v in dictData.items(): tempRow = list(k) tempRow.append(v) tempList.append(tempRow) dict['data'] = tempList elif (aggr_function == 'min'): for row in dataList: key = tuple(row[:-1]) if (dictData.get(key, 'None') == 'None'): dictData[key] = row[-1] else: tempVal = dictData.get(key) if (row[-1] < tempVal): tempVal = row[-1] dictData[key] = tempVal for k, v in dictData.items(): tempRow = list(k) tempRow.append(v) tempList.append(tempRow) dict['data'] = tempList if having_condition is not None: column_name, operator, value = self._parse_condition( having_condition) tempList = dict.get('data') colList = dict.get('column_names') for i in range(len(colList)): if colList[i] == column_name: index = return_cols.index(i) break tempList = [ row for row in tempList if get_op(operator, row[index], value) ] dict['data'] = tempList # we need to set the new column names/types and no of columns, since we might # only return some columns dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] dict['_no_of_columns'] = len(return_cols) # order by the return table if specified if order_by is None: return Table(load=dict) else: return Table(load=dict).order_by(order_by, asc)
def aggregate(self, aggregation: dict, group_by): ''' apply aggregation to grouped data example: { 'building1': [capacity1] 'building2': [capacity2], 'building3': [capacity3, capacity4] } ''' grouping = True if group_by else False if not group_by: for aggregation_key, aggregation_value in aggregation.items(): group_by = {} aggregation_idx = self.column_names.index(aggregation_key) group_by[aggregation_key] = [ el[aggregation_idx] for el in self.data ] keys_to_delete = [] for aggregation_key, aggregation_value in aggregation.items(): # if aggregation function is avg or sum, column must be integer or float if aggregation_value[0] == 'avg' or aggregation_value[0] == 'sum': col_type = self.column_types[self.column_names.index( aggregation_key)] if col_type is not int and col_type is not float: raise Exception( f'{aggregation_value[0]} can be applied only to numeric columns' ) # save aggregation function to a lambda function, so it can be applied later if aggregation_value[0] == 'min': aggregation_function = lambda values: min(values) elif aggregation_value[0] == 'max': aggregation_function = lambda values: max(values) elif aggregation_value[0] == 'avg': aggregation_function = lambda values: sum(values) / len(values) elif aggregation_value[0] == 'count': aggregation_function = lambda values: len(values) elif aggregation_value[0] == 'sum': aggregation_function = lambda values: sum(values) else: raise Exception('Aggregation function was not found') # loop through grouped items and apply aggregation function for k, v in group_by.items(): group_by[k] = [aggregation_function(v)] # if len > 1, a having condition exists and data should be filtered if len(aggregation_value) > 1: column_name, operator, value = self._parse_condition( aggregation_value[1]) if not get_op(operator, group_by[k][0], value): # append keys of data that should be filtered out due to having condition keys_to_delete.append(k) # delete filtered out keys for k in keys_to_delete: del group_by[k] # add aggregation function as a column to be displayed in final print of data # example: sum( capacity) self.column_names = [ f'{aggregation_value[0]} ( {col} )' if col == aggregation_key else col for col in self.column_names ] # if grouping was applied, add grouped items to the final print of data if grouping: self.data = [v + [k] for k, v in group_by.items()] else: self.data = [v for k, v in group_by.items()]
def _full_outer_join(self, table_right: Table, condition): ''' Left outer join table (left) with a supplied table (right) where condition is met. ''' # get columns and operator column_name_left, operator, column_name_right = self._parse_condition( condition, join=True) # try to find both columns, if you fail raise error try: column_index_left = self.column_names.index(column_name_left) column_index_right = table_right.column_names.index( column_name_right) except: raise Exception(f'Columns dont exist in one or both tables.') # get the column names of both tables with the table name in front # ex. for left -> name becomes left_table_name_name etc left_names = [ f'{self._name}_{colname}' for colname in self.column_names ] right_names = [ f'{table_right._name}_{colname}' for colname in table_right.column_names ] # define the new tables name, its column names and types join_table_name = f'{self._name}_join_{table_right._name}' join_table_colnames = left_names + right_names join_table_coltypes = self.column_types + table_right.column_types join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types=join_table_coltypes) # count the number of operations (<,> etc) no_of_ops = 0 # Simplify the full outer join as left outer and right outer for row_left in self.data: left_value = row_left[column_index_left] exist = False for row_right in table_right.data: right_value = row_right[column_index_right] no_of_ops += 1 if get_op(operator, left_value, right_value): exist = True # EQ_OP join_table._insert(row_left + row_right) if exist != True: none_values = [] for i in range(len(self.columns)): none_values.append(None) join_table._insert(row_left + none_values) for row_left2 in table_right.data: left_value2 = row_left2[column_index_left] exist = False for row_right2 in self.data: right_value2 = row_right2[column_index_right] no_of_ops += 1 if get_op(operator, left_value2, right_value2): exist = True #EQ_OP join_table._insert(row_right2 + row_left2) if exist != True: none_values = [] for i in range(len(self.columns)): none_values.append(None) join_table._insert(none_values + row_left2) print(f'## Select ops no. -> {no_of_ops}') print(f'# Left table size -> {len(self.data)}') print(f'# Right table size -> {len(table_right.data)}') return join_table