# Assumed imports for this example; `config` and `data_tools` are project-local modules.
import os

import numpy as np
import tensorflow as tf
from librosa.core import load, resample

import config
import data_tools


def from_flac_to_tfrecords(train_r=0.8, valid_test_r=0.2):
	# Extract the information about this subset (speakers, chapters)
	# Dictionary with the following shape: 
	# {speaker_key: {chapters: [...], sex:'M/F', ... } }
	folder = config.data_root+'/'+config.data_subset
	speakers_info = data_tools.read_metadata(config.data_subset)
	keys_to_index = {key: i for i, key in enumerate(speakers_info.keys())}

	# Build a speaker-index -> sex lookup table
	sex = ['M'] * len(speakers_info)
	for k, v in speakers_info.items():
		i = keys_to_index[k]
		sex[i] = v['sex']

	# np.save appends '.npy' to the given filename
	np.save('genders_index.arr', sex)

	allfiles = np.array([os.path.join(r,f) for r,dirs,files in os.walk(folder) for f in files if f.endswith(".flac")])
	L = len(allfiles)
	np.random.shuffle(allfiles)
	train = allfiles[:int(L*train_r)]
	valid = allfiles[int(L*train_r):int(L*(train_r+valid_test_r/2))]
	test = allfiles[int(L*(train_r+valid_test_r/2)):]
	
	print(len(train), len(valid), len(test))

	for group_name, data_split in [("train", train), ("test", test), ("valid", valid)]:

		for s in ['M', 'F']:

			writer = tf.python_io.TFRecordWriter(group_name + '_' + s + '.tfrecords')

			for file in data_split:

				# Speaker key is the third-to-last path component: .../<speaker>/<chapter>/<file>.flac
				splits = file.split('/')
				key = splits[-3]
				speaker_sex = speakers_info[key]['sex']

				if speaker_sex == s:

					raw_audio, sr = load(file, sr=16000)
					raw_audio = resample(raw_audio, orig_sr=sr, target_sr=config.fs)
					raw_audio = raw_audio.astype(np.float32).tobytes()

					example = tf.train.Example(features=tf.train.Features(
						feature={'audio': tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_audio])),
								 'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[keys_to_index[key]]))
						}))
					print(group_name, s, key, keys_to_index[key])
					writer.write(example.SerializeToString())

			writer.close()
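
The records written above can be read back with the same TF 1.x API for inspection. A minimal sketch, assuming one of the file names the function produces ('train_M.tfrecords'):

# Read-back sketch (TF 1.x); the file name is an assumption for illustration.
for record in tf.python_io.tf_record_iterator('train_M.tfrecords'):
	example = tf.train.Example.FromString(record)
	audio_bytes = example.features.feature['audio'].bytes_list.value[0]
	speaker_idx = example.features.feature['key'].int64_list.value[0]
	audio = np.frombuffer(audio_bytes, dtype=np.float32)
	print(speaker_idx, audio.shape)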
Example 2
	def create_h5_dataset(self, output_fn, subset=config.data_subset, data_root=config.data_root):
		"""
		Create an H5 file of spectrograms from the LibriSpeech dataset for the given subset.

		Inputs:
			output_fn: filename for the created file
			subset: LibriSpeech subset, e.g. 'dev-clean'
			data_root: LibriSpeech folder path

		"""

		# Extract the information about this subset (speakers, chapters)
		# Dictionary with the following shape: 
		# {speaker_key: {chapters: [...], sex:'M/F', ... } }
		speakers_info = data_tools.read_metadata(subset)

		with h5py.File(output_fn,'w') as data_file:

			for (key, elements) in speakers_info.items():
				if key not in data_file:
					# Create an H5 Group for each key/speaker
					data_file.create_group(key)

				# Current speaker folder path
				folder = data_root+'/'+subset+'/'+key

				print_progress(0, len(elements['chapters']), prefix = 'Speaker '+key+' :', suffix = 'Complete')

				# For all the chapters read by this speaker
				for i, chapter in enumerate(elements['chapters']): 
					# Find all .flac audio
					for root, dirs, files in os.walk(folder+'/'+chapter): 
						for file in files:
							if file.endswith(".flac"):

								path = os.path.join(root,file)
								raw_audio, samplerate = sf.read(path)

								# Generate the spectrogram for the current audio file
								_, _, spec = create_spectrogram(raw_audio, samplerate)

								# Store the transposed complex spectrogram;
								# gzip level 0 favors speed over compression
								data_file[key].create_dataset(file,
									data=spec.T.astype(np.complex64),
									compression="gzip",
									dtype=np.complex64,
									compression_opts=0)

					print_progress(i + 1, len(elements['chapters']), prefix = 'Speaker '+key+' :', suffix = 'Complete')


		print('Dataset for the subset: ' + subset + ' has been built')
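
create_spectrogram is a project-local helper not shown in this example. A minimal sketch using scipy.signal.stft, whose (frequencies, times, spectrogram) return matches the `_, _, spec` unpacking above; the window parameters are assumptions:

from scipy.signal import stft

def create_spectrogram(raw_audio, samplerate):
	# Hypothetical sketch: returns (f, t, Zxx) where Zxx is the complex STFT.
	# nperseg/noverlap are illustrative values, not the project's actual ones.
	return stft(raw_audio, fs=samplerate, nperseg=512, noverlap=256)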
    def create_raw_audio_dataset(output_fn,
                                 subset=config.data_subset,
                                 data_root=config.data_root):
        """
		Create a H5 file from the LibriSpeech dataset and the subset given:

		Inputs:
			output_fn: filename for the created file
			subset: LibriSpeech subset : 'dev-clean' , ...
			data_root: LibriSpeech folder path

		"""
        from librosa.core import resample, load

        # Extract the information about this subset (speakers, chapters)
        # Dictionary with the following shape:
        # {speaker_key: {chapters: [...], sex:'M/F', ... } }
        speakers_info = data_tools.read_metadata(subset)
        with h5py.File(output_fn, 'w') as data_file:

            for key, elements in tqdm(speakers_info.items(),
                                      total=len(speakers_info),
                                      desc='Speakers'):
                if key not in data_file:
                    # Create an H5 Group for each key/speaker
                    data_file.create_group(key)

                # Current speaker folder path
                folder = data_root + '/' + subset + '/' + key
                # For all the chapters read by this speaker
                for i, chapter in enumerate(
                        tqdm(elements['chapters'], desc='Chapters')):
                    # Find all .flac audio
                    for root, dirs, files in os.walk(folder + '/' + chapter):
                        for file in tqdm(files, desc='Files'):
                            if file.endswith(".flac"):
                                path = os.path.join(root, file)
                                # load() already resamples to 16 kHz; the extra
                                # resample only matters if config.fs differs
                                raw_audio, sr = load(path, sr=16000)
                                raw_audio = resample(raw_audio, orig_sr=sr, target_sr=config.fs)
                                data_file[key].create_dataset(
                                    file,
                                    shape=raw_audio.shape,
                                    data=raw_audio,
                                    chunks=raw_audio.shape,
                                    maxshape=raw_audio.shape,
                                    compression="gzip",
                                    compression_opts=9)

        print('Dataset for the subset: ' + subset + ' has been built')
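
The resulting file stores one dataset per speaker key and audio file name. A minimal read-back sketch, assuming the output was named 'raw_audio.h5':

import h5py

# Hypothetical read-back: print the shape of the first audio file per speaker.
with h5py.File('raw_audio.h5', 'r') as f:
    for speaker_key in f:
        for fname in f[speaker_key]:
            print(speaker_key, fname, f[speaker_key][fname].shape)
            break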
	def __init__(self, ratio=[0.90, 0.05, 0.05], **kwargs):
		"""
		Inputs:
			ratio: ratio for train / valid / test set
			kwargs: Dataset parameters
		"""

		np.random.seed(config.seed)

		self.nb_speakers = kwargs['nb_speakers']
		self.sex = kwargs['sex']
		self.batch_size = kwargs['batch_size']
		self.chunk_size = kwargs['chunk_size']
		self.no_random_picking = kwargs['no_random_picking']

		# Flags for Training/Validation/Testing sets
		self.TRAIN = 0
		self.VALID = 1
		self.TEST = 2

		# TODO 
		metadata = data_tools.read_metadata()

		if self.sex not in (['M', 'F'], ['F', 'M'], ['M'], ['F']):
			raise Exception('sex must be ["M","F"], ["F","M"], ["M"] or ["F"]')

		# Create a key -> speaker-index dictionary
		# and count the number of speakers
		self.key_to_index = {}
		self.sex_to_keys = {}
		j = 0

		if 'M' in self.sex:
			M = data_tools.males_keys(metadata)
			self.sex_to_keys['M'] = M
			for k in M:
				self.key_to_index[k] = j
				j += 1 
		if 'F' in self.sex:
			F = data_tools.females_keys(metadata)
			self.sex_to_keys['F'] = F
			for k in F:
				self.key_to_index[k] = j
				j += 1

		self.tot_speakers = j

		self.file = h5py.File(kwargs['dataset'], 'r')


		# Define all the items related to each key/speaker
		self.total_items = []

		for key in self.key_to_index.keys():
			for val in self.file[key]:
				# Get one file related to a speaker and check how many chunks
				# can be obtained with the current chunk size
				chunks = self.file['/'.join([key,val])].shape[0]//self.chunk_size
				# Add each possible chunk to the items, with the form:
				# 'key/file/#chunk'
				self.total_items += ['/'.join([key,val,str(i)]) for i in range(chunks)]

		# Shuffle all the items before splitting
		np.random.shuffle(self.total_items)

		L = len(self.total_items)

		# Training / Valid / Test Separation
		train = self.create_tree(self.total_items[:int(L*ratio[0])])
		valid = self.create_tree(self.total_items[int(L*ratio[0]):int(L*(ratio[0]+ratio[1]))])
		test = self.create_tree(self.total_items[int(L*(ratio[0]+ratio[1])):])
		
		self.train = TreeIterator(train, self)
		self.valid = TreeIterator(valid, self)
		self.test = TreeIterator(test, self)
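
A hypothetical usage sketch; the class name Dataset and every value below are assumptions for illustration, but the keyword arguments match the ones read in __init__:

# Hypothetical usage; names and values are illustrative assumptions.
dataset = Dataset(ratio=[0.90, 0.05, 0.05],
	dataset='raw_audio.h5',   # an H5 file such as the one built by create_raw_audio_dataset
	nb_speakers=2,
	sex=['M', 'F'],
	batch_size=64,
	chunk_size=512 * 40,
	no_random_picking=False)

# dataset.train / dataset.valid / dataset.test are TreeIterator instances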
Example 5
			self.dico[int(item)] = i

	def __iter__(self):
		return self

	def get_labels(self):
		return self.dico

from data_tools import read_metadata, males_keys, females_keys
if __name__ == "__main__":

	###
	### TEST
	###

	H5_dic = read_metadata()
	print(H5_dic)
	chunk_size = 512*100

	males = H5PY_RW('test_raw.h5py', subset=males_keys(H5_dic))
	fem = H5PY_RW('test_raw.h5py', subset=females_keys(H5_dic))

	print('Data with', len(H5_dic), 'male and female speakers')
	print(males.length(), 'elements')
	print(fem.length(), 'elements')

	mixed_data = Mixer([males, fem], chunk_size=chunk_size, with_mask=False, with_inputs=True, shuffling=True)

	batch_size = 128

	mixed_data.adjust_split_size_to_batchsize(batch_size)