Example #1
import os

import utils_and_settings

def find_table_data(data_dirs):
    "finds all the tables the db is supposed to have, along with"
    data_dirs = data_dirs
    table_data = {}  # keys: table names, values: col_data dicts
    # Go thru the data directories.  Build a mapping between them and the db tables
    for in_dir in data_dirs:  # each entry is a scenario-level dict of directories
        for root, dirs, files in os.walk(in_dir["data"]):

            # each of these directories populates a db table
            for d in dirs:
                # the table is named after the dir
                raw_name = utils_and_settings.get_table_name_from_dir(os.path.join(root, d))
                t_name = raw_name.replace("-", "_")

                col_data = []

                # each column is named after a file
                for f in os.listdir(os.path.join(root, d)):
                    fn = os.path.join(root, d, f)
                    col_data.append(
                        {
                            "scenario_dir": in_dir,
                            "file": os.path.join(in_dir, root, d, f),
                            "col": utils_and_settings.get_column_name(os.path.join(root, d, f)),
                        }
                    )

                table_data[t_name] = col_data
    return table_data
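
For context, here is a rough usage sketch for Example #1. The directory layout and the exact strings produced by the utils_and_settings helpers are assumptions inferred from how the function uses them, not part of the original code.

# hypothetical input: each entry is a scenario-level dict with a "data" directory
data_dirs = [
    {"data": "scenarios/base/data"},
    {"data": "scenarios/build/data"},
]
table_data = find_table_data(data_dirs)
# table_data now maps each table name (derived from a directory name, with "-"
# replaced by "_") to a list of dicts such as:
#   {"scenario_dir": {"data": "scenarios/base/data"},
#    "file": "scenarios/base/data/trips/am-peak.csv",
#    "col": "am_peak"}   # column name as assumed from get_column_name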
Example #2
import logging
import os

import numpy as np

# assumed import: Example #1 suggests these helpers live in utils_and_settings;
# the original snippet calls them as bare names, so the source module is a guess
from utils_and_settings import (get_table_name_from_dir, get_column_name,
                                get_window, make_OD_array, find_max_rows)

logger = logging.getLogger(__name__)

def build_flat_files(in_dir, out_dir, test_max_rows=None):
    """Given a directory, gathers all file contents into an array and saves it.
       The first two columns will have the origin (row) and destination (col)"""
    
    np.set_printoptions(precision=4)
    data_file_lengths = []

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    for root, dirs, files in os.walk(in_dir, followlinks=True):
        # each directory contains the files for a single output array

        for d in dirs:
            logger.info('creating np array from {}'.format(d))
            # determine the np array type
            # TODO: might want to build a record array for arrays with mixed types
            dtype = float
            # the db table name comes from the last bit of root + the current dir
            table_name = get_table_name_from_dir(os.path.join(root, d))
            # which files feed this table?
            try:
                files = os.listdir(os.path.join(root, d))
            except OSError:
                # unreadable directory: skip it rather than silently reusing
                # the file list from the os.walk() tuple above
                continue
            output_cols = len(files) + 2  # one column per file, plus origin/dest
            # rows and cols for the main array
            if files:
                data = np.genfromtxt(os.path.join(root, d, files[0]), delimiter=',', dtype=dtype)
                input_cols = int(data[0][-1])    # the header row's last entry is taken as the column count
                input_rows = len(data) - 1       # as the data would have it
                initial_rows = input_rows        # remember this for posterity
                rows = initial_rows              # this may change with large directories
                
            else:
                print("Sorry, I can't find any files in directory {}".format(os.path.join(root, d)))
                continue

            # can we make a big enough array (depends on the machine)?
            try:
                npa = None  # release any prior array before allocating
                npa = np.zeros((input_rows * input_cols, output_cols), dtype=dtype)
            except MemoryError:
                # nope.  Let's see what we *can* do.  First, provide some data to the user
                error_info = []
                error_info.append('problem creating np array for {}'.format(os.path.join(root, d)))
                msg = 'input_rows: {}  input_cols: {}  output_cols: {}'
                error_info.append(msg.format(input_rows, input_cols, output_cols))
                logger.info(error_info)

                # fall back to loading the files in row-limited chunks
                rows = find_max_rows(input_cols, output_cols)
                logger.info('files in {} are too big for a single load.  Doing it piecemeal'.format(d))

            # this is an override to facilitate testing - not used in practice
            if test_max_rows:
                rows = test_max_rows

            # we'll need to iterate this block as many times as required to get all the rows
            rows_so_far = 0
            # a generator that yields the header and footer rows to skip for each chunk
            windows = get_window(initial_rows, rows_per=rows, start=2)
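            # (assumed contract, for illustration only: each next(windows) call
            #  is expected to yield a dict with 'skip_header' and 'skip_footer'
            #  line counts that bracket the next `rows` data rows of every
            #  file; the real get_window helper is not shown in this example)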
            
            
            while rows_so_far < initial_rows:

                columns_reported = ['origin', 'dest']
                rows_needed = min(rows * input_cols, (initial_rows - rows_so_far) * input_cols)
                npa = None  # release the previous chunk before allocating the next
                npa = np.zeros((rows_needed, output_cols), dtype=dtype)   # all-zero array
                od = make_OD_array(rows, cols=input_cols, start_row=rows_so_far + 1, max_row=initial_rows)

                # replace the first two cols with O D values
                npa[:, 0] = od[0]
                npa[:, 1] = od[1]
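                # (illustration, assuming make_OD_array returns the two label
                #  vectors: for 2 rows x 3 cols starting at row 1 that would be
                #  od[0] = [1, 1, 1, 2, 2, 2] and od[1] = [1, 2, 3, 1, 2, 3],
                #  one (origin, dest) pair per flattened cell; the helper is
                #  not part of this example)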
                
                # the column for the data (first 2 are for O/D)
                data_col = 2
                files.sort()

                this_window = next(windows)     # sets header and footer parts of the file to skip
                skip_header = this_window['skip_header']
                skip_footer = this_window['skip_footer'] + 1
                # TODO: fix this last-file indexing issue correctly
                if skip_footer == 1:
                    skip_footer = 0
                
                for f in files:
                    logger.debug('working on file: {}: beginning at line {}'.format(f, skip_header))
                    fn = os.path.join(root, d, f)
                    raw_data = np.genfromtxt(fn, delimiter=',', dtype=dtype,
                                             skip_header=skip_header,
                                             skip_footer=skip_footer)
                    data = raw_data[:, 1:]    # removes the row/col headers
                    data = data.ravel()       # turns it into a vector
                    # check if this is a null (all zeros) file - we can only do this on files ingested intact
                    if not data.sum() and rows == initial_rows:
                        logger.warning('**** WARN:  Did not load {} - all 0s ***'.format(os.path.join(d, f)))
                        continue
                    
                    # check if this file is a different size than others encountered (intact ingestion only)
                    if rows == initial_rows:
                        file_length = len(data)
                        trial_data_lengths = data_file_lengths[:]
                        trial_data_lengths.append(file_length)
                        if data_file_lengths and len(set(trial_data_lengths)) > 1:
                            msg = '**** WARN: Did not load {} - expected length of {}, but it was {}.'
                            logger.warning(msg.format(os.path.join(d, f), data_file_lengths[0], file_length))
                            continue
                    
                    # load data into the next column of the np array (skipped files never reach here)
                    npa[:, data_col] = data[:]
                    columns_reported.append(get_column_name(f))
                    data_file_lengths.append(len(data))

                    data_col += 1
    
                # save the data file, labeled with the 'rows_so_far' offset
                data_fn = os.path.join(root, out_dir, table_name + "_data" + str(rows_so_far) + ".csv")
                msg = 'attempting to save flat file {} for info in {}'
                logger.debug(msg.format(data_fn, os.path.join(root, d)))
                try:
                    np.savetxt(data_fn, npa, delimiter=',', header="|".join(columns_reported), fmt="%s")
                except OSError:
                    msg = 'Could not save file {} in {}'
                    logger.warning(msg.format(data_fn, os.path.join(root, d)))
                else:
                    logger.debug('success')
                rows_so_far += rows
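
For context, a minimal driver for Example #2 might look like the sketch below. The paths are hypothetical and the logging setup is an assumption; the project's real entry point is not shown in these examples.

import logging

logging.basicConfig(level=logging.INFO)

# walk ./scenario_data and write one <table>_data<offset>.csv per directory
build_flat_files('scenario_data', 'flat_files')

# the test override caps each chunk at 100 rows, exercising the piecemeal path
build_flat_files('scenario_data', 'flat_files', test_max_rows=100)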