Example #1
0
	def setUp(self):
		"""Create a fake Map pointing at a local CSV fixture plus a fresh test DB."""
		# PROJECT_ROOT is only read here, so no ``global`` declaration is needed.
		self.db = DBConnect()
		self.db.create_db("test_db")

		self.test_map = Map(db_name="test_db")

		self.test_map.homepage = "http://www.mytestingdata.com/"
		self.test_map.description = "The testing data for my Maps"
		# Two entries point at the same fixture: one fully specified, one URL-only,
		# so both accepted shapes of Map.data get exercised.
		self.test_map.data = {
								'testdata': {
									'url': ("file:///" + PROJECT_ROOT + "tests/testdata.csv"),
									'mirror': "",
									'sha1': "",
									'dictionary': "",
								},
								'testdata2': {
									'url': ("file:///" + PROJECT_ROOT + "tests/testdata.csv"),
								},
							}
		self.test_map.db_type = 'sql'
		self.test_map.db_name = 'test_db'
		# Map's pipeline methods use self.__name__ as the working-directory name.
		self.test_map.__name__ = "TestMap"
Example #2
0
	def __init__(self, db_name=None):
		"""Optionally override the class-level db_name, then open a DB connection.

		db_name -- explicit database name; the class attribute is kept when None.
		"""
		# specific database name specified
		if db_name:
			self.db_name = db_name

		# open MySQL connection (comment was stray-indented in the original)
		self.db = DBConnect()
Example #3
0
class Map:
	# Methods internal to Maps class

	# These are here rather than in __init__ so that subclasses can simply define each of these
	# within the subclass maps
	
	# URL for homepage of dataset (e.g. http://census.gov/)
	homepage = ''

	description = ''

	# Specific URLs for the dataset downloads
	# needs to be a dictionary ala
	# data = {'census1': {url: 'http://www.census.gov/census1.zip', mirror: '', sha1: '', dictionary: ''}}
	# TODO: data is downloaded and added to databases in the order listed here. if one set depends on installation of another, put the
	# dependent ones later
	data = {}

	# type of database to install e.g. 'docstore','sql','keyvalue'
	# see conf/settings.py for available databases
	db_type = 'sql'

	db_name = ''


	def __init__(self, db_name=None):

		# specific database name specified
		if db_name:
			self.db_name = db_name

					# open MySQL connection
		self.db = DBConnect()


	def __is_installed(self, db_name):
		"""
		Utility to check if Map is already installed in base database using available
		databases and db prefix names
		"""

		# Create database if it isn't there already
		# Need to check that this returns TRUE
		return self.db.query(("SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s';" % self.db_name))

	def setup(self):
		""" Need to prep by creating a folder and changing the system into that directory """
		global VERBOSE, TMP_DIRECTORY

		if VERBOSE:
			print "Initializing temporary working directory..."

		# make directory for this in temp dir + name_of_map
		# switch to this directory
		os.chdir(TMP_DIRECTORY)

		try:
			os.mkdir(self.__name__)
		except OSError:
			print "Directory already exists for this file..."

		os.chdir((TMP_DIRECTORY + '%s' % self.__name__))

		# in case of sql, create a database here
		# commit query

	def download(self):
		"""
		Using data dictionary of urls, grab the files and display a nice progress bar while doing it
		"""
		global VERBOSE, TMP_DIRECTORY
		if VERBOSE:
			print "Downloading data files..."

		os.chdir(TMP_DIRECTORY + "%s" % self.__name__)

		# need an iterator to download what is either a single page or a load of files, but that should get specified.
		# this should be the easiest one to write
		for k, v in self.data.iteritems():
			download_file(v['url'], with_progress_bar=True)

		# use a messy2sql because we'll need it
		# eventually this can be part of an IF import -- we only need it if we are doing SQL
		#m2s = Messy2SQL()
		

	def unpack(self):
		"""
		Unpack the downloads into the root directory for this map
		"""
		global VERBOSE

		if VERBOSE:
			print "Unpacking data files to disk..."

		# need to check what file type we've got now...
		file_types = {
			'.csv': lambda x: None,  # don't need to unpack uncompressed files
			'.sql': lambda x: None,
			'.xls': lambda x: None,
			'.xlsx': lambda x: None,
			'.html': lambda x: None,
			'.pdf': lambda x: None,
			'.tar': unpack_tar,
			'.gz': unpack_gzip,
			'.tgz': unpack_tar,
			'.tar.gz': unpack_tar,
			'.zip': unpack_zip,
		}

		# get all files in working directory of this map
		files = os.listdir(TMP_DIRECTORY + '%s/' % self.__name__)

		# iterate through files
		for f in files:
			file_name = os.path.basename(f)

			# separate out the file extension
			root, ext = guess_extension(file_name)

			# using file type, extract this file!
			file_types[ext](os.path.basename(f))


	def install(self):
		"""
		Does installation of the files into user's chosen database

		This is a primarily internal method, but if base it should just get called.

		NOTES:
			- Does installation have to assume that it can just install from each of the files available? Do we
			  have to re-write the installer for something complex like the US Census? And is that an acceptable level
			  of configuration for a Map?

		TODO:
			- Need to fix how headers work -- can specify whether headers are present, whether all data should be installed
			  into the same database?
		"""

		# check if we need a separate db for each url or whether one is enough
		# one is enough if specified here
		if self.db_name:
			db_name = self.db_name
			self.db.create_db(self.__name__)

		# for every file url
		#files = os.listdir(TMP_DIRECTORY + '%s/' % self.__name__)
		for k, v in self.data.iteritems():
			
			root, ext = guess_extension(v['url'])
			file_name = os.path.basename(root + ext)

			# If we don't have a db name, we should find it in the URLs
			if self.db_name:
				db_name = self.db_name
			else:
				db_name = v['database']
				self.db.create_db(db_name=db_name)
			
			if ext == ".sql":
				# if we have a SQL file, we should run that
				# TODO: THIS DOESN'T ACTUALLY WORK, BUT WE NEED TO DO SOMETHING LIKE THIS
				self.db.query(f)

			elif ext in (".csv", ".pdf", ".xls", ".xlsx", ".html"):	
				# create messy2sql instance
				m2s = Messy2SQL(file_name, DATABASES['sql']['type'])
				# if we have PDF, HTML, CSV, or Excel files, we should use messy2sql
				# get a table query, run it!


				fh = open((TMP_DIRECTORY + self.__name__ + '/' + file_name), 'rb')
				
				# use messytables to build a MessyTables RowSet with file type
				rows = {
					'.csv': CSVTableSet(fh).tables[0],
					# '.pdf': PDFTableSet(file_name),
					# '.xlsx': XLSTableSet(file_name),
					# '.xls': XLSTableSet(file_name),
					# '.html': HTMLTableSet(file_name),
				}[ext]

				# use the rowset here to create a sql table query and execute
				self.db.create_table(query = m2s.create_sql_table(rows), db_name=db_name)

				# get insert statements
				self.db.insert(query = m2s.create_sql_insert(rows), db_name=db_name, table_name=root)
			else:
				pass


	def cleanup(self):
		global VERBOSE

		if VERBOSE:
			print "Cleaning up folders and closing DB connections..."
		
		# need to delete all the files in tmp/thismap
		os.chdir('../')
		os.rmdir(self.__name__)

		# close DB connection
		cursor.close()
		cnx.close()
Example #4
0
class TestSQLMap(unittest.TestCase):
	"""
	Unittest to try making a Map that is a SQL map.
	Test needs to setup with a .csv style dataset.
	"""
	def setUp(self):
		"""Create a fake Map pointing at a local CSV fixture plus a fresh test DB."""
		# PROJECT_ROOT is only read here, so no ``global`` declaration is needed.
		self.db = DBConnect()
		self.db.create_db("test_db")

		self.test_map = Map(db_name="test_db")

		self.test_map.homepage = "http://www.mytestingdata.com/"
		self.test_map.description = "The testing data for my Maps"
		# Two entries point at the same fixture: one fully specified, one URL-only,
		# so both accepted shapes of Map.data get exercised.
		self.test_map.data = {
								'testdata': {
									'url': ("file:///" + PROJECT_ROOT + "tests/testdata.csv"),
									'mirror': "",
									'sha1': "",
									'dictionary': "",
								},
								'testdata2': {
									'url': ("file:///" + PROJECT_ROOT + "tests/testdata.csv"),
								},
							}
		self.test_map.db_type = 'sql'
		self.test_map.db_name = 'test_db'
		# Map's pipeline methods use self.__name__ as the working-directory name.
		self.test_map.__name__ = "TestMap"


	def is_installed_test(self):
		""" Should pass if this is already installed in standard area """
		# NOTE(review): the name lacks the ``test_`` prefix, so unittest
		# discovery never runs this stub -- confirm whether that is intentional.
		pass

	def test_setup(self):
		""" Should pass if directory is created in TEMP and if OS is pointed to that directory """
		self.test_map.setup()

		try:
			os.chdir(TMP_DIRECTORY + 'TestMap')
		except OSError:
			# The old code printed AFTER a bare ``assert 0 == 1``, so the
			# message was unreachable; self.fail reports it properly.
			self.fail("Could not find test data directory")

	def test_download(self):
		""" Should pass if download completes and dataset resides in tmp/ """
		self.test_map.download()

		# assertTrue is the idiomatic form of assertEqual(..., True)
		self.assertTrue(os.path.exists(TMP_DIRECTORY + "TestMap/testdata.csv"))

	def test_unpack(self):
		""" Should pass if files exist on disk """
		self.test_map.unpack()

		# assertTrue is the idiomatic form of assertEqual(..., True)
		self.assertTrue(os.path.exists(TMP_DIRECTORY + "TestMap/testdata.csv"))

	def test_install(self):
		""" Should pass if data gets inserted into SQL database """
		# Relies on install() raising if anything goes wrong.
		# TODO: verify inserted rows, e.g. self.db.query("SELECT * FROM ...")
		self.test_map.install()

	def test_cleanup(self):
		""" Should pass if data created in download & unpack is deleted """
		# TODO: call self.test_map.cleanup() and assert the TestMap working
		# directory no longer exists.
		pass

	def tearDown(self):
		""" Destroy fake dataset and database changes (if any) """