Example #1
0
    def xtest_load_with_insert(self):
        """Ensure load_with_insert() loads a flat file table correctly.

        Builds the flat files into self.out_dir, loads the first (sorted)
        file with the INSERT method, then compares the resulting table
        contents against the expected rows.  (``xtest_`` prefix disables
        this test under the default unittest discovery.)
        """
        build_flat_files.build_flat_files(self.temp_dir, self.out_dir)

        # loads the data tables (INSERT method) — first file in sorted order
        files = sorted(os.listdir(self.out_dir))
        first_file = files[0]
        fn = os.path.join(self.out_dir, first_file)
        t_name = utils_and_settings.get_table_name_from_fn(first_file)
        build_tables.load_with_insert(db=DB, t_name=t_name, file=fn, drop_old=True)

        # grab the data and ensure it's right
        curs.execute('END')  # close any open transaction before SELECTing
        curs.execute("SELECT * FROM {}".format(t_name))
        actual = curs.fetchall()

        # NOTE(review): rows (3.0, 2.0, ...) and (4.0, 2.0, ...) repeat the
        # values 10033/20033 and 10034/20034 — these look like copy/paste
        # typos in the expected data; confirm against build_flat_files
        # before re-enabling this test.
        target = [(1.0, 1.0, 10011.0, 20011.0),
                  (1.0, 2.0, 10012.0, 20012.0),
                  (1.0, 3.0, 10013.0, 20013.0),
                  (1.0, 4.0, 10014.0, 20014.0),
                  (2.0, 1.0, 10021.0, 20021.0),
                  (2.0, 2.0, 10022.0, 20022.0),
                  (2.0, 3.0, 10023.0, 20023.0),
                  (2.0, 4.0, 10024.0, 20024.0),
                  (3.0, 1.0, 10031.0, 20031.0),
                  (3.0, 2.0, 10033.0, 20033.0),
                  (3.0, 3.0, 10033.0, 20033.0),
                  (3.0, 4.0, 10034.0, 20034.0),
                  (4.0, 1.0, 10041.0, 20041.0),
                  (4.0, 2.0, 10034.0, 20034.0),
                  (4.0, 3.0, 10043.0, 20043.0),
                  (4.0, 4.0, 10044.0, 20044.0)]

        # Compare the full result sets in one assertion: unlike the previous
        # zip() loop, this also fails when rows are missing or extra.
        self.assertEqual(target, actual, 'load_with_insert failed')
Example #2
0
def build_tables(db=DB, pdir=PARENT_DIR, drop_old=True, data_dir=None):
    """Builds database tables from flat files produced by build_flat_files.py.

    Assumes that column names are in a header row, in a specific format:
       #  <col name>|<col name>| (etc)

    Also assumes the db table columns are all floats.

    First, it attempts to use COPY to create the tables and load the data.
    This may fail with large files, depending on memory available.  If this
    happens, it reverts to using INSERT statements.

    Args:
        db:       name of the target PostgreSQL database
        pdir:     parent directory walked recursively for flat files
        drop_old: when True, DROP each table the first time it is encountered
        data_dir: currently unused; kept for backward compatibility
    """
    conn = psycopg2.connect(database=db,
                            user=login_info['user'],
                            password=login_info['password']
                            )
    curs = conn.cursor()

    # If the database is not available, try to create it.
    # NOTE(review): psycopg2.connect(database=db) above already fails when
    # *db* does not exist, so this branch likely never creates anything —
    # confirm whether a connection to a maintenance DB was intended.
    try:
        curs.execute('END')  # leave the implicit transaction first
        curs.execute('CREATE DATABASE {}'.format(db))
        logger.debug('creating new database')
    except psycopg2.ProgrammingError:
        pass  # database already exists

    # With partitioned files, make sure that the table is only deleted once.
    # We'll use this set to remember which tables have been DROPped.
    safe_tables = set()

    try:
        # find each file in the data dir, create a db table, and load the data
        for root, dirs, files in os.walk(pdir):
            for f in files:
                logging.info('building table for file {}'.format(f))
                # the table name comes from the file name
                t_name = utils_and_settings.get_table_name_from_fn(f)

                # DROP an existing table exactly once per table name
                if drop_old and t_name not in safe_tables:
                    logger.debug('dropping table {}'.format(t_name))
                    curs.execute('END')
                    curs.execute('DROP TABLE IF EXISTS {}'.format(t_name))
                    safe_tables.add(t_name)

                logger.debug('creating new table {}'.format(t_name))

                # the header supplies the column names,
                # e.g. "# origin|dest|col1name|col2"
                file = os.path.join(root, f)
                with open(file, 'r') as fil:
                    header = fil.readline()
                    db_cols = header.replace('#', '').strip().split('|')

                    # create a table based on the columns (assume all fields float)
                    sql = 'CREATE TABLE IF NOT EXISTS {} (\n'.format(t_name)
                    sql += ',\n'.join('\t {} float'.format(col) for col in db_cols)
                    sql += ');'
                    curs.execute(sql)
                    conn.commit()
                    logger.info('Created table {}'.format(t_name))
                    logger.debug('Columns: {}'.format(db_cols))

                    # try to load the file with COPY (fast path)
                    logger.debug('Loading flat file {} to the database with COPY'.format(f))
                    fil.seek(0)
                    cols = '(' + ', '.join(db_cols) + ')'
                    sql = "COPY {} {} FROM STDIN WITH CSV HEADER DELIMITER AS ','".format(t_name, cols)
                    try:
                        curs.copy_expert(sql=sql, file=fil)
                        conn.commit()
                        logger.debug('success.')
                    except psycopg2.Error:
                        # COPY failed - probably due to memory issues;
                        # do it the slow way with INSERTs
                        logger.warning('Nope.  COPY failed for file {} trying to INSERT it.'.format(f))
                        # clear the aborted transaction so later statements
                        # on this connection don't fail
                        conn.rollback()
                        load_with_insert(db=db, t_name=t_name, file=file, drop_old=drop_old)
    finally:
        # release DB resources even if the walk/load raises
        curs.close()
        conn.close()