def unpack(self):
    """
    Unpack the downloads into the root directory for this map.

    Lists every entry in this map's working directory
    (``TMP_DIRECTORY + '<map name>/'``), determines its extension with
    ``guess_extension`` and hands the entry's base name to the matching
    unpacker:

    - ``.tar``, ``.tgz``, ``.tar.gz`` -> ``unpack_tar``
    - ``.gz`` -> ``unpack_gzip``
    - ``.zip`` -> ``unpack_zip``
    - ``.csv``, ``.sql``, ``.xls``, ``.xlsx``, ``.html``, ``.pdf`` are
      already uncompressed and are left untouched.

    Entries whose extension has no registered handler are skipped instead
    of aborting the whole run with a ``KeyError``.
    """
    if VERBOSE:
        print("Unpacking data files to disk...")

    def leave_as_is(file_name):
        # Uncompressed files need no unpacking.
        return None

    # Dispatch table: file extension -> callable taking the file name.
    handlers = {
        '.csv': leave_as_is,
        '.sql': leave_as_is,
        '.xls': leave_as_is,
        '.xlsx': leave_as_is,
        '.html': leave_as_is,
        '.pdf': leave_as_is,
        '.tar': unpack_tar,
        '.gz': unpack_gzip,
        '.tgz': unpack_tar,
        '.tar.gz': unpack_tar,
        '.zip': unpack_zip,
    }

    # Working directory of this map.
    work_dir = TMP_DIRECTORY + '%s/' % self.__name__

    for entry in os.listdir(work_dir):
        file_name = os.path.basename(entry)

        # Separate out the file extension.
        root, ext = guess_extension(file_name)

        handler = handlers.get(ext)
        if handler is None:
            # Unknown file type (or a directory): nothing we know how to
            # unpack, so leave it alone and carry on with the rest.
            if VERBOSE:
                print("Skipping %s: no unpacker for extension %r" % (file_name, ext))
            continue

        # NOTE(review): handlers receive only the base name, not the full
        # path -- presumably they resolve it against the working directory
        # themselves; confirm against unpack_tar/unpack_gzip/unpack_zip.
        handler(file_name)
def install(self):
    """
    Install the downloaded files into the user's chosen database.

    This is a primarily internal method, but if base it should just get
    called.

    For every entry in ``self.data`` the target database is chosen
    (``self.db_name`` when set, otherwise the entry's ``'database'`` value,
    which is created on the fly) and the file named by the entry's
    ``'url'`` is loaded from this map's working directory according to its
    extension:

    - ``.sql``: the script is read and handed to ``self.db.query``.
    - ``.csv``: parsed with messytables; Messy2SQL builds the CREATE TABLE
      and INSERT statements, which are executed through ``self.db``.
    - ``.pdf``, ``.xls``, ``.xlsx``, ``.html``: not supported yet; skipped.
    - anything else is ignored.

    NOTES:
    - Does installation have to assume that it can just install from each
      of the files available? Do we have to re-write the installer for
      something complex like the US Census? And is that an acceptable
      level of configuration for a Map?

    TODO:
    - Need to fix how headers work -- can specify whether headers are
      present, whether all data should be installed into the same
      database?
    """
    # One shared database is enough when the map specifies one.
    if self.db_name:
        # NOTE(review): the database is created under self.__name__ while
        # the tables below are installed into self.db_name -- confirm the
        # two are meant to be the same.
        self.db.create_db(self.__name__)

    work_dir = TMP_DIRECTORY + self.__name__ + '/'

    for entry in self.data.values():
        root, ext = guess_extension(entry['url'])
        file_name = os.path.basename(root + ext)
        file_path = work_dir + file_name

        # Without a map-wide db name, each URL entry names its own database.
        if self.db_name:
            db_name = self.db_name
        else:
            db_name = entry['database']
            self.db.create_db(db_name=db_name)

        if ext == ".sql":
            # Run the SQL script itself.
            # NOTE(review): assumes self.db.query accepts raw SQL text --
            # confirm against the db wrapper's signature.
            with open(file_path, 'r') as sql_file:
                self.db.query(sql_file.read())

        elif ext == ".csv":
            m2s = Messy2SQL(file_name, DATABASES['sql']['type'])

            # Keep the file open while the row set is consumed, and make
            # sure it is closed afterwards even if a statement fails.
            with open(file_path, 'rb') as fh:
                rows = CSVTableSet(fh).tables[0]

                # Create the table, then fill it.
                self.db.create_table(query=m2s.create_sql_table(rows),
                                     db_name=db_name)
                # NOTE(review): `root` comes from the URL, not the base
                # file name -- verify it is a usable table name.
                self.db.insert(query=m2s.create_sql_insert(rows),
                               db_name=db_name,
                               table_name=root)

        elif ext in (".pdf", ".xls", ".xlsx", ".html"):
            # No table-set reader is wired up for these formats yet
            # (PDFTableSet / XLSTableSet / HTMLTableSet are still to do).
            if VERBOSE:
                print("Skipping %s: installing %s files is not supported yet" % (file_name, ext))