コード例 #1
0
ファイル: process.py プロジェクト: samrund/actual_hdfs
	def dump_data(self, hdf5_folder, filename, timezone, subjects={}):
		"""Append the records found in one HDF5 file to a per-subject dictionary.

		The subject names are taken from the first space-separated token of
		each line printed by `h5ls` for `filename + '/subjects'`. Every
		subject group is then dumped with `h5dump`, and the dump is handed to
		`self.extract_data`, whose result is appended to `subjects[subject]`.

		Args:
			hdf5_folder: prefix prepended verbatim to the `h5ls` / `h5dump`
				executable names (no path separator is inserted here).
			filename: HDF5 file to read.
			timezone: not used inside this method.
			subjects: dictionary mapping subject name to a list of records;
				it is updated in place.

		Returns:
			The same `subjects` dictionary that was passed in (or the default
			one), after the new records have been appended.

		NOTE(review): the default value of `subjects` is a single dict shared
		by every call that omits the argument, so records accumulate across
		such calls. Callers may be relying on that; confirm before changing it.
		"""
		mprintln('Dumping the data from: ' + filename)
		listing = check_output([hdf5_folder + 'h5ls', filename + '/subjects'])

		# Record counts per subject before this file is processed.
		for name in subjects:
			print(name + " - " + str(len(subjects[name])))

		for line in listing.split('\n'):
			subject = line.split(' ')[0]
			if not subject:
				# Blank line (or a line starting with a space): nothing to dump.
				continue

			group_dump = check_output([hdf5_folder + 'h5dump', '--group=subjects/' + subject, filename])

			subjects.setdefault(subject, [])
			subjects[subject] += self.extract_data(group_dump, filename)

		# Record counts per subject after this file has been processed.
		for name in subjects:
			print(name + " - " + str(len(subjects[name])))

		return subjects
コード例 #2
0
ファイル: process.py プロジェクト: samrund/actual_hdfs
	def save_to_csv(self, fname, subjects):
		"""Write every record of every subject to a CSV file.

		Args:
			fname: path of the file to create; it is opened in 'wb' mode, so
				an existing file is overwritten.
			subjects: iterable whose items are themselves iterables of
				records; each record becomes one CSV row.

		NOTE(review): if `subjects` is a dict, iterating it yields its keys
		rather than its record lists - verify what callers pass.
		"""
		mprintln('Saving the data to: ' + fname)

		with open(fname, 'wb') as out_file:
			row_writer = csv.writer(out_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
			for records in subjects:
				# One CSV row per record, in the order the records are stored.
				row_writer.writerows(records)
コード例 #3
0
ファイル: process.py プロジェクト: samrund/actual_hdfs
	def process_all(self, hdf5_folder, columns, antennas, timezone, input_files, bin_time):
		"""Dump every input file, bin the records per subject and collect the result.

		For each file the records are merged into a running per-subject
		dictionary via `self.dump_data`. Then, for every subject,
		`self.get_binned_record` is called repeatedly; each non-empty result
		is removed from the front of that subject's record list and handed to
		`self.add_record_to_dictionary`. Records that were not consumed stay
		in the running dictionary and are available when the next file is
		processed.

		Args:
			hdf5_folder: forwarded to `self.dump_data`.
			columns: forwarded to `self.add_record_to_dictionary`.
			antennas: forwarded to `self.add_record_to_dictionary`.
			timezone: forwarded to `self.dump_data` and
				`self.add_record_to_dictionary`.
			input_files: sequence of HDF5 file names to process, in order.
			bin_time: forwarded to `self.get_binned_record`.

		Returns:
			The dictionary built up by `self.add_record_to_dictionary`
			(an empty dict if nothing was binned).
		"""
		mprintln('Processing the data...')
		mprint(str(len(input_files)) + ' file(s) found')

		result = {}
		subjects_data = {}

		for input_file in input_files:
			# dump_data's signature is (hdf5_folder, filename, timezone, subjects).
			# Pass timezone explicitly so the accumulated dictionary is bound to
			# `subjects` instead of silently landing in the `timezone` slot.
			subjects_data = self.dump_data(hdf5_folder, input_file, timezone, subjects_data)

			for subject in subjects_data:
				while True:
					binned_data = self.get_binned_record(subjects_data[subject], bin_time)
					if not binned_data:
						# Nothing more can be binned for this subject right now.
						break

					# Drop the records that were just consumed by the bin.
					subjects_data[subject] = subjects_data[subject][len(binned_data):]

					# Add the binned data to the result dictionary.
					result = self.add_record_to_dictionary(result, timezone, subject, binned_data, columns, antennas)

		return result