def verify_data( data, columns, hierarchy_indexes, start_ind ):
    '''Detect eventual errors in the given file (parts inconsistent with
    columns and hierarchy). Return found errors in the list.

    data              -- iterable of rows (lists of field values)
    columns           -- columns description used to derive expected types
    hierarchy_indexes -- indexes of the hierarchy columns inside a row
    start_ind         -- index of the first row of data (used in error reports)
    '''
    row_types = get_row_types( columns, hierarchy_indexes )
    expected_len = len( row_types )
    errors = []
    log.description('Verifying data')
    for (i, row) in enumerate(data, start_ind):
        # TODO log it per 1%
        if i % 1000 == 0:
            log.description(i)
        # Every row must have exactly one field per declared column.
        if len( row ) != expected_len:
            errors.append( bad_len( i, len(row), expected_len ) )
        # Field values must match the expected column types.
        if not are_fields_correct( row_types, row ):
            errors.append( bad_fields( i, row_types, row ) )
        # A row with all hierarchy fields empty cannot be placed in the tree.
        if is_row_hierarchy_empty( row, hierarchy_indexes ):
            errors.append( empty_hierarchy( i ) )
    if errors:
        log.error('%s error(s) found' % len(errors))
    else:
        log.description('Finished with no errors')
    return errors
def check_db_counters( self, init_endpoint_id, init_dbtree_id, init_data_id ):
    '''Check consistency of db counters and data in the database (if no
    data has a higher counter than the db counter). If such a situation
    happens, ask the user if it should be removed.

    init_endpoint_id -- highest endpoint id recorded by the db counter
    init_dbtree_id   -- highest dbtree node id recorded by the db counter
    init_data_id     -- highest data (ptree) id recorded by the db counter
    '''
    def remove_confirmed():
        # Ask the user whether stale objects should be removed; accepts y/Y.
        log.question('Do you want to remove them? (Y/N)')
        dec = raw_input('Your decision: ')
        return dec.lower() == 'y'

    # Check db tree nodes
    if self.db.get_higher_dbtree( init_dbtree_id ) != []:
        log.warn('Found wrong dbtree nodes, higher than %d' % init_dbtree_id)
        if remove_confirmed():
            self.db.remove_higher_dbtree( init_dbtree_id )
            log.description('Removed wrong dbtree nodes')
    else:
        log.description('Dbtree correct')

    # Check hierarchy
    endpoint = 'data_' + str( init_endpoint_id )
    if self.db.get_higher_hierarchy( endpoint ) != []:
        log.warn('Found wrong hierarchy columns, higher than %d' % init_endpoint_id)
        if remove_confirmed():
            self.db.remove_higher_hierarchy( endpoint )
            log.description('Removed wrong hierarchy columns')
    else:
        log.description('Hierarchy correct')

    # Check columns
    if self.db.get_higher_columns( endpoint ):
        log.warn('Found wrong columns, higher than %d' % init_endpoint_id)
        if remove_confirmed():
            self.db.remove_higher_columns( endpoint )
            log.description('Removed wrong columns')
    else:
        log.description('Columns correct')

    # Check relations in ptree
    # BUGFIX: prompt here used to read 'Your_decision: ' (typo).
    if self.db.get_higher_ptree( init_data_id ) != []:
        log.warn('Found wrong ptree nodes, higher than %d' % init_data_id)
        if remove_confirmed():
            self.db.remove_higher_ptree( init_data_id )
            log.description('Removed wrong ptree nodes')
    else:
        log.description('Ptree correct')

    # Remove tables with incorrect endpoints data
    tables_names = self.db.get_higher_datatables( init_endpoint_id )
    if tables_names != []:
        log.warn('Found too many tables, higher than %d' % init_endpoint_id)
        if remove_confirmed():
            self.db.drop_higher_datatables( init_endpoint_id )
            log.description('Removed wrong data tables:')
            for tname in tables_names:
                log.description('Removed table %s' % tname)
    else:
        log.description('Data tables correct')

    # Check user uploaded collections
    users = self.db.get_non_admin_users()
    for user in users:
        if self.db.has_old_collections( user, init_dbtree_id ):
            log.warn('Found old collections from user %s' % user)
            if remove_confirmed():
                self.db.remove_old_collections( user, init_dbtree_id )
        else:
            log.description('User %s correct' % user)
def upload(self, has_header=True, visible=True, restore=False):
    '''Main method of Uploader. Checks db counters, if any inconsistency
    is found, then ask if it should be removed. After that, checks data
    that is about to be uploaded. After this attempts to upload data. If
    any error occurs during that process, then removes uploaded data to
    that moment. Returns tuple containg boolean value that tells if it
    succeeded and name of the new endpoint.

    There are 3 optional parameters:
    has_header -- if data file comes with header,
    visible    -- if endpoint should be visible after upload,
    restore    -- if state of db should be restored to the state pointed
                  in debug_restore() method. Use with CAUTION!
    '''
    # restore db state to a state before a recent data insertion
    if restore:
        self.debug_restore()

    # Check db counters; remember initial values so a failed upload can
    # be rolled back to exactly this state.
    init_endpoint_id = self.db.get_max_endpoint()
    init_dbtree_id = self.db.get_max_dbtree_id()
    init_data_id = self.db.get_max_data_id()

    # TODO move it to db module. data from db module should come correct!
    log.section('DB counters correctness')
    self.check_db_counters(init_endpoint_id, init_dbtree_id, init_data_id)
    log.end_section()

    # TODO move it to Meta class constructor!
    # Check if parents, columns and hierarchy from meta is correct
    log.section('Metadata correctness')
    try:
        log.description('Verifying metadata')
        self.check_correctness()
    except UploadDataException as e:
        log.error(e.get_error())
        # BUGFIX: close the section on the error path as well.
        log.end_section()
        return ( False, e.get_error() )
    log.end_section()

    # Check data, if any error is in data, stop processing and return
    # list with errors
    log.section('Data correctness')
    errors = self.find_errors(has_header)
    if errors:
        # BUGFIX: close the section before the early return.
        log.end_section()
        return (False, errors)
    log.end_section()

    endpoint = None
    log.section('Data insertion')
    if self.debug:
        # In debug mode let exceptions propagate unchanged.
        endpoint = self.insert_data_into_db(has_header, visible)
    else:
        try:
            endpoint = self.insert_data_into_db( has_header, visible )
        except UploadDataException as e:
            log.error('Failed.')
            log.error(e)
            log.end_section()
            # cleanup after unsuccessful upload
            self.remove_uploaded( init_endpoint_id, init_dbtree_id, init_data_id )
            # BUGFIX: exit status was 0 (success) on a failed upload.
            exit(1)
    log.description('Done!')
    log.end_section()
    return (True, endpoint)
def upload_data( self, endpoint, has_header=True, sum_up=True):
    '''Remove table for endpoint = given endpoint(if exists) and create
    a new one for new data. Create IdMap to track parent-child relations
    between nodes. If has_header = True, then omit the first line.
    Transform rows from original data to rows without hierarchy, and
    create hierarchy rows. Return max id of nodes from the collection.
    '''
    def db_type( col_type, col_format ):
        # 'number' columns become float when the format shows a decimal
        # point, int otherwise; every other type maps to itself.
        if col_type == 'number':
            return 'float' if '.' in col_format else 'int'
        else:
            return col_type

    def type_fun( col_type, col_format ):
        # Python conversion function matching the db column type.
        return int if db_type( col_type, col_format ) == 'int' else float

    # UnicodeReader
    bulk = self.get_data(has_header)

    # Create and remove table
    self.db.remove_table( endpoint )
    columns = [(t['key'], db_type(t['type'], t['format'])) for t in self.meta.get_columns()]
    self.db.create_table( endpoint, columns )

    # Collect positions (offset by 5 metadata columns — presumably id,
    # parent, type, name, leaf; TODO confirm) of summable numeric columns.
    summable_cols = []
    for (i, col) in enumerate( self.meta.get_columns() ):
        if col['type'] == 'number':
            summable_cols.append( (i+5, type_fun(col['type'], col['format'])) )

    start_id = self.db.get_max_data_id()
    id_map = IdMap( start_id )
    batch_size = self.count_batch_size()
    # Consistency: use log like the rest of the module (was a bare print).
    log.description('BATCH_SIZE = %s' % batch_size)

    # Process all rows
    # rows to be uploaded in one batch
    batch_rows = []
    # rows that are actually processed, they need to be remembered,
    # because numeric fields should be summed from many leaves
    proc_rows = []
    # hierarchy
    ptree_hier = []
    # ptree rows to be uploaded in one batch
    batch_ptree_rows = []
    # list representing values in hierarchy fields
    old_hierarchy_in_row = []
    total_row = self.create_total_row( None )
    for i, row in enumerate( bulk ):
        if i % 1000 == 0:
            log.description(i)
        # retrieve hierarchy from the row
        hierarchy_in_row = self.get_hierarchy_cols( row )
        # remove empty fields from hierarchy columns
        while len( hierarchy_in_row ) > 0 and hierarchy_in_row[-1][0] == '':
            hierarchy_in_row.pop()
        common_level = self.hierarchy_common_level( hierarchy_in_row, old_hierarchy_in_row )
        # Replace empty numeric cells with Nones (NULL in Postgres)
        hier_num = len(hierarchy_in_row)
        hier_cells = row[:hier_num]
        data_cells = row[hier_num:]
        row = hier_cells + [None if c[1] != 'string' and e == '' else e
                            for e, c in zip(data_cells, columns)]
        # Transform rows to non hierarchical form
        new_rows = self.add_rows(id_map, common_level, hierarchy_in_row, row)
        ptree_hier, new_ptree_rows = self.create_ptree_rows( common_level,
                len( hierarchy_in_row ), new_rows, ptree_hier )
        leaf_row = new_rows[-1]
        # remove rows that are not needed to sum values anymore
        # (all their children were added) and if there is top level
        # row in them, then add values from it to total row
        if i > 0:
            if common_level == 0:
                self.sum_values( total_row, proc_rows[0], summable_cols )
            batch_rows += proc_rows[common_level:]
            proc_rows = proc_rows[:common_level]
        proc_rows += new_rows
        # sum from last but one row using values from leaf row
        # BUGFIX: index renamed i -> j so it no longer shadows the outer
        # enumerate counter.
        for j in range( len(proc_rows) - 2, -1, -1):
            self.sum_values( proc_rows[j], leaf_row, summable_cols )
        old_hierarchy_in_row = hierarchy_in_row
        batch_ptree_rows += new_ptree_rows
        if len( batch_rows ) > batch_size:
            self.db.insert_data( batch_rows, endpoint )
            batch_rows = []
            self.db.insert_ptree_data( batch_ptree_rows )
            batch_ptree_rows = []
    batch_rows += proc_rows
    # add values from the last top row to total row
    # NOTE(review): proc_rows[-1] is the deepest remaining row, not the
    # top one; correct when the remaining chain is one row deep — confirm
    # proc_rows[0] is not intended here.
    self.sum_values( total_row, proc_rows[-1], summable_cols )
    # TODO: changed
    # TODO: get rid of magic numbers
    total_row_id = id_map.add_id( 0, 1 )[0]
    total_row[0] = total_row_id
    #total_row = self.create_total_row( total_row_id )
    batch_rows.append( total_row )
    batch_ptree_rows.append( (total_row_id, []) )
    self.db.insert_data( batch_rows, endpoint )
    self.db.insert_ptree_data( batch_ptree_rows )
    return id_map.get_last_id()
def insert_data_into_db( self, has_header, visible ):
    '''Insert a node (or nodes, when new parents are needed) into dbtree,
    upload the new hierarchy and columns, then upload the data, sum
    columns of higher level nodes and update the db data counter.
    Updates ptree. Returns the new endpoint's name.

    has_header -- whether the data file starts with a header line
    visible    -- whether the new endpoint should be visible after upload
    '''
    log.description('Uploading...')

    # Register the collection in the db tree first; subsequent steps all
    # hang off the endpoint name it yields.
    endpoint, created_node_ids = self.update_dbtree(visible)
    log.description('Dbtree uploaded')

    self.update_hierarchy(endpoint)
    log.description('Hierarchy uploaded')

    self.update_columns( endpoint )
    log.description('Columns uploaded')

    # Upload the rows; keep the id of the last uploaded row so the db
    # counter can be advanced afterwards.
    max_row_id = self.upload_data(endpoint, has_header=has_header)
    log.description('Data uploaded')
    log.description('Columns summed up, ptree uploaded')

    # Record collection ownership for non-admin uploaders.
    uploader = self.meta.get_user()
    if not self.db.is_admin( uploader ):
        self.db.add_user_collections( uploader, created_node_ids )

    self.db.set_max_data_id( max_row_id )
    log.description('Ptree uploaded')
    return endpoint