コード例 #1
0
ファイル: map.py プロジェクト: jimthedev/miner
	def unpack(self):
		"""
		Unpack the downloads into the root directory for this map.

		Lists the map's working directory (TMP_DIRECTORY joined with the map
		name) and hands each file name to the unpacker registered for its
		extension. Formats that are used as downloaded (.csv, .sql, .xls,
		.xlsx, .html, .pdf) are left untouched. Files whose extension has no
		registered unpacker are skipped instead of aborting the whole run
		with a KeyError.
		"""
		if VERBOSE:
			# The parenthesised single-argument form prints the same text
			# whether print is a statement or a function.
			print("Unpacking data files to disk...")

		def leave_as_is(_file_name):
			# uncompressed files don't need to be unpacked
			return None

		# map each known file extension to the callable that extracts it
		unpackers = {
			'.csv': leave_as_is,
			'.sql': leave_as_is,
			'.xls': leave_as_is,
			'.xlsx': leave_as_is,
			'.html': leave_as_is,
			'.pdf': leave_as_is,
			'.tar': unpack_tar,
			'.gz': unpack_gzip,
			'.tgz': unpack_tar,
			'.tar.gz': unpack_tar,
			'.zip': unpack_zip,
		}

		# get all files in the working directory of this map
		working_directory = os.path.join(TMP_DIRECTORY, self.__name__)

		for file_name in os.listdir(working_directory):
			# os.listdir yields bare entry names, so no basename() call is needed

			# separate out the file extension
			_root, ext = guess_extension(file_name)

			unpacker = unpackers.get(ext)
			if unpacker is None:
				# unknown file type: leave it alone and carry on with the rest
				if VERBOSE:
					print("Skipping %s: no unpacker for extension %r" % (file_name, ext))
				continue

			# NOTE(review): the unpackers receive only the bare file name, not
			# the full path -- presumably they resolve the directory
			# themselves; confirm against their definitions.
			unpacker(file_name)
コード例 #2
0
ファイル: map.py プロジェクト: jimthedev/miner
	def install(self):
		"""
		Does installation of the files into user's chosen database

		For every entry in ``self.data`` the file name is derived from the
		entry's ``'url'`` and handled according to its extension:

			- ``.sql``: the file's contents are passed to ``self.db.query``.
			- ``.csv``: the file is read with messytables and Messy2SQL builds
			  the table-creation and insert queries that are run through ``self.db``.
			- ``.pdf``, ``.xls``, ``.xlsx``, ``.html``: recognised, but no table
			  reader is wired up yet, so NotImplementedError is raised.
			- any other extension is ignored.

		When ``self.db_name`` is set, every file is installed into that database;
		otherwise each entry's ``'database'`` value names the database, which is
		created before the file is installed.

		This is primarily an internal method.

		Raises:
			NotImplementedError: for a recognised file type that has no reader yet.

		NOTES:
			- Does installation have to assume that it can just install from each of the files available? Do we
			  have to re-write the installer for something complex like the US Census? And is that an acceptable level
			  of configuration for a Map?

		TODO:
			- Need to fix how headers work -- can specify whether headers are present, whether all data should be installed
			  into the same database?
			- Wire up PDFTableSet / XLSTableSet / HTMLTableSet readers for the non-CSV formats.
		"""

		# check if we need a separate db for each url or whether one is enough
		# one is enough if specified here
		# NOTE(review): this creates a database named after the map
		# (self.__name__) while the tables below are written to self.db_name --
		# confirm that the two are meant to differ.
		if self.db_name:
			self.db.create_db(self.__name__)

		map_directory = os.path.join(TMP_DIRECTORY, self.__name__)

		# for every file url
		for entry in self.data.values():

			root, ext = guess_extension(entry['url'])
			file_name = os.path.basename(root + ext)
			file_path = os.path.join(map_directory, file_name)

			# If we don't have a db name, we should find it in the URLs
			if self.db_name:
				db_name = self.db_name
			else:
				db_name = entry['database']
				self.db.create_db(db_name=db_name)

			if ext == ".sql":
				# if we have a SQL file, we should run that
				# NOTE(review): assumes self.db.query accepts the SQL text of
				# the whole file; multi-statement dumps may need splitting --
				# confirm against the db wrapper.
				with open(file_path, 'r') as sql_file:
					self.db.query(sql_file.read())

			elif ext == ".csv":
				# create messy2sql instance
				m2s = Messy2SQL(file_name, DATABASES['sql']['type'])

				# keep the file open while the row set is consumed, and make
				# sure the handle is closed afterwards even if a query fails
				with open(file_path, 'rb') as fh:
					# use messytables to build a MessyTables RowSet
					rows = CSVTableSet(fh).tables[0]

					# use the rowset here to create a sql table query and execute
					self.db.create_table(query=m2s.create_sql_table(rows), db_name=db_name)

					# get insert statements
					# NOTE(review): table_name is the extension-less value
					# returned by guess_extension for the URL, not the base
					# file name -- confirm this is the intended table name.
					self.db.insert(query=m2s.create_sql_insert(rows), db_name=db_name, table_name=root)

			elif ext in (".pdf", ".xls", ".xlsx", ".html"):
				# fail with a clear message instead of an opaque KeyError
				raise NotImplementedError(
					"%s files are not supported by install() yet: %s" % (ext, file_name)
				)

			# files of any other type are ignored