def ingest_data(self, cache_dir, tf_dir, desired_frame_rate):
    """Converts the Jens .mat recordings into per-subject TFRecord files.

    Each `*.mat` file found in `cache_dir` (taken in sorted order) is treated
    as one subject. Every trial in the file is split into an EEG part and an
    audio part, both parts are resampled to `desired_frame_rate`, and the
    resulting experiment is z-scored and written to `tf_dir/subject_NN`.
    A single summary covering all written files is produced at the end.

    Args:
      cache_dir: Local copy of the original archive file from the Internet.
      tf_dir: Folder where tfrecord files are written out.
      desired_frame_rate: Desired frame rate after ingestion.
    """
    subject_files = sorted(tf.io.gfile.glob(os.path.join(cache_dir, '*.mat')))
    # The signals are handed to the experiment in memory; '.' is what gets
    # passed for both directory arguments.
    eeg_dir = '.'
    sound_dir = '.'
    make_if_not_exists(tf_dir)

    print('Ingesting %d files of Jens data.' % len(subject_files),
          file=regression_data_print)
    written_files = []
    for subject_number, subject_file in enumerate(subject_files, start=1):
        print('Ingesting %s' % subject_file, file=regression_data_print)
        subject_out_dir = os.path.join(
            tf_dir, 'subject_{:02d}'.format(subject_number))
        recording = loadmat(subject_file)['data']
        # Both frame rates should be 128Hz according to:
        #   https://zenodo.org/record/1158410/#.XvqtpZNKjVs
        audio_rate = recording['fsample']
        eeg_rate = recording['fsample']

        trials = {}
        for trial_number, samples in enumerate(recording['trial'], start=1):
            # Rows 0..68 are taken as EEG and row 69 as audio; both slices
            # are transposed before resampling.
            raw_eeg = samples[:69, :].T
            raw_audio = samples[69:70, :].T
            resampled_eeg = preprocess.Preprocessor(
                'eeg', eeg_rate, desired_frame_rate).resample(raw_eeg)
            resampled_audio = preprocess.Preprocessor(
                'audio', audio_rate, desired_frame_rate).resample(raw_audio)
            brain_file = ingest.MemoryBrainDataFile(
                {'eeg_data': resampled_eeg}, sr=desired_frame_rate)
            trials['trial_{:02d}'.format(trial_number)] = [
                {'intensity': resampled_audio},
                brain_file,
            ]
            logging.info('Audio and EEG data shapes: %s, %s',
                         str(raw_audio.shape), str(raw_eeg.shape))

        experiment = ingest.BrainExperiment(
            trials, sound_dir, eeg_dir, frame_rate=desired_frame_rate)
        experiment.load_all_data(sound_dir, eeg_dir)
        experiment.z_score_all_data()
        for one_trial in experiment.iterate_trials():
            one_trial.assemble_brain_data('eeg_data')
            for feature_name in one_trial.model_features:
                feature_shape = one_trial.model_features[feature_name].shape
                logging.info('Trial # %s, audio shapes %s',
                             str(feature_name), str(feature_shape))

        make_if_not_exists(subject_out_dir)
        written_files.extend(experiment.write_all_data(subject_out_dir))

    write_summary(cache_dir, tf_dir, desired_frame_rate, written_files)
def ingest_data(self, cache_dir, tf_dir, desired_frame_rate):
    """Reads the Telluride4 data from a mat file and ingests it to TFRecords.

    Loads `Telluride2015.mat` from `cache_dir`, checks that it holds 4 audio
    signals and 32 EEG trials, pairs trial `i` with audio signal `i % 4`,
    z-scores the resulting experiment and writes it to `tf_dir` together with
    a summary.

    Args:
      cache_dir: Directory with the local copy of the original data.
      tf_dir: Where to store the TFRecord files.
      desired_frame_rate: Desired frame rate after ingestion.

    Raises:
      ValueError: If the file does not contain exactly 4 audio signals or
        exactly 32 EEG trials.
    """
    mat_data = loadmat(os.path.join(cache_dir, 'Telluride2015.mat'))
    mat_objects = mat_data['data']
    eeg_signals = mat_objects['eeg']
    audio_signals = mat_objects['wav']
    logging.info('Audio and EEG data shapes: %s, %s',
                 str(audio_signals.shape), str(eeg_signals.shape))

    # The shape is wrapped in a 1-tuple: formatting a multi-element tuple
    # directly with a single %s raises TypeError and would hide the real
    # problem from the caller.
    if audio_signals.shape[0] != 4:  # Number of audio files.
        raise ValueError('Incorrect shapes for audio_signals (%s)' %
                         (audio_signals.shape,))
    if eeg_signals.shape[0] != 32:  # Number of trials.
        raise ValueError('Incorrect shapes for eeg_signals (%s)' %
                         (eeg_signals.shape,))
    audio_signals_count = audio_signals.shape[0]
    eeg_signals_count = eeg_signals.shape[0]

    make_if_not_exists(tf_dir)
    trial_dict = {}
    for i in range(eeg_signals_count):
        # The audio signals are reused cyclically across the trials.
        audio = audio_signals[i % audio_signals_count]
        sound_dict = {
            'intensity': audio,
            'ones': np.ones(audio.shape, dtype=audio.dtype),
        }
        eeg_dict = {'eeg_data': eeg_signals[i]}
        trial_dict['trial_{:02d}'.format(i + 1)] = [
            sound_dict,
            ingest.MemoryBrainDataFile(eeg_dict),
        ]

    # The signals are handed to the experiment in memory; '.' is what gets
    # passed for both directory arguments.
    eeg_dir = '.'
    sound_dir = '.'
    exp = ingest.BrainExperiment(trial_dict, sound_dir, eeg_dir,
                                 frame_rate=desired_frame_rate)
    exp.load_all_data(sound_dir, eeg_dir)
    exp.z_score_all_data()
    for trial in exp.iterate_trials():
        trial.assemble_brain_data('eeg_data')
        for k in trial.model_features:
            logging.info('Trial#, Audio shape: %s, %s', str(k),
                         str(trial.model_features[k].shape))

    all_ingested_files = exp.write_all_data(tf_dir)
    write_summary(cache_dir, tf_dir, desired_frame_rate, all_ingested_files)
def test_memory_brain_data_file(self):
    """MemoryBrainDataFile reports the names, rate and values it was given."""
    sample_rate = 4
    signals = {'one': np.arange(10) + 100, 'two': np.arange(10) + 200}
    data_file = ingest.MemoryBrainDataFile(signals, sample_rate)

    self.assertEqual(set(data_file.signal_names), set(signals.keys()))
    # Every signal shares the one sample rate passed to the constructor.
    for name in ('one', 'two'):
        self.assertEqual(data_file.signal_fs(name), sample_rate)
    for name in ('one', 'two'):
        stored = data_file.signal_values(name)
        self.assertTrue(np.all(stored == signals[name]))
def test_brain_memory_experiment2(self):
    """Round-trips an in-memory experiment with an EEG offset via TFRecords.

    Builds one trial with 1s of audio and 2s of two-channel EEG, drops the
    first second of EEG with `fix_eeg_offset(1.0)`, writes the trial out and
    checks that the file read back holds the second half of both channels.
    """
    # Imported locally so this test does not depend on the file's imports.
    import tempfile

    fs = 16000
    audio_len = fs
    audio_data = np.random.randn(audio_len)
    frame_sr = 100
    channel_one = np.arange(2 * frame_sr)  # Use ints for easier debugging
    channel_two = np.arange(2 * frame_sr) + 200
    # The final comparison expects C1 before C2, i.e. insertion order.
    eeg_data = {'C1': channel_one, 'C2': channel_two}
    df = ingest.MemoryBrainDataFile(eeg_data, frame_sr)

    trial_two_name = 'trial_2'
    experiment_dict = {
        trial_two_name: [{
            'audio_data': audio_data,
            'audio_sr': fs
        }, df],
    }
    experiment = ingest.BrainExperiment(experiment_dict,
                                        self._test_dir, self._test_dir,
                                        frame_rate=frame_sr)
    self.assertTrue(experiment)
    experiment.load_all_data(self._test_dir, self._test_dir)
    summary = experiment.summary()
    self.assertIn('Found 1 trials', summary)
    self.assertIn('Trial trial_2: 2 EEG channels with 2s of eeg data', summary)

    for trial in experiment.iterate_trials():
        trial.compute_intensity()
        trial.fix_eeg_offset(1.0)
        trial.assemble_brain_data(list(eeg_data.keys()))
        # Master copy of EEG data has moved from brain_data to the
        # audio_features dict.
        brain_data = trial.audio_features['eeg']
        # Now the eeg size is shorter, due to fix_eeg_offset above.
        self.assertEqual(brain_data.shape, (frame_sr, 2))

    # Write into a private directory instead of the shared '/tmp', so that
    # concurrent runs and other tests that also write 'trial_2.tfrecords'
    # cannot interfere, and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        experiment.write_all_data(tmp_dir)
        tf_file = os.path.join(tmp_dir, trial_two_name + '.tfrecords')

        (count, error) = ingest.count_tfrecords(tf_file)
        self.assertEqual(error, 0)
        self.assertEqual(count, frame_sr)

        file_data = ingest.read_tfrecords(tf_file)
        print('Read in data and found keys:', list(file_data.keys()))

        self.assertIn('eeg', file_data)
        self.assertIn('loudness', file_data)
        np.testing.assert_allclose(
            file_data['eeg'],
            np.hstack((np.reshape(channel_one[frame_sr:], (-1, 1)),
                       np.reshape(channel_two[frame_sr:], (-1, 1)))))
def test_brain_memory_experiment(self):
    """Round-trips an in-memory two-channel experiment through TFRecords."""
    audio_rate = 16000
    audio_data = np.random.randn(2 * audio_rate, 1)
    frame_sr = 100
    eeg_len = 2 * frame_sr
    first_channel = np.arange(eeg_len)  # Use ints for easier debugging
    second_channel = np.arange(eeg_len) + 200
    # Ordered so that the column order of the assembled EEG is known.
    eeg_data = collections.OrderedDict(
        [('C1', first_channel), ('C2', second_channel)])
    brain_file = ingest.MemoryBrainDataFile(eeg_data, frame_sr)

    trial_two_name = 'trial_2'
    audio_spec = {'audio_data': audio_data, 'audio_sr': audio_rate}
    experiment_dict = {trial_two_name: [audio_spec, brain_file]}
    experiment = ingest.BrainExperiment(
        experiment_dict, self._test_dir, self._test_dir, frame_rate=frame_sr)
    self.assertTrue(experiment)
    experiment.load_all_data(self._test_dir, self._test_dir)

    summary = experiment.summary()
    for expected_text in (
            'Found 1 trials',
            'Trial trial_2: 2 EEG channels with 2s of eeg data'):
        self.assertIn(expected_text, summary)

    for trial in experiment.iterate_trials():
        trial.assemble_brain_data(list(eeg_data.keys()))
        # Master copy of EEG data has moved from brain_data to the
        # model_features dict.
        assembled = trial.model_features['eeg']
        self.assertEqual(assembled.shape, (eeg_len, 2))

    tmp_dir = flags.FLAGS.test_tmpdir or '/tmp'
    all_ingested_files = experiment.write_all_data(tmp_dir)
    self.assertLen(all_ingested_files, 1)

    tf_file = os.path.join(tmp_dir, trial_two_name + '.tfrecords')
    (_, error) = ingest.count_tfrecords(tf_file)
    self.assertEqual(error, 0)

    file_data = ingest.read_tfrecords(tf_file)
    self.assertIn('eeg', file_data)
    # One column per channel, in the order the channels were declared.
    expected_eeg = np.hstack((
        first_channel[:eeg_len].reshape(-1, 1),
        second_channel[:eeg_len].reshape(-1, 1),
    ))
    np.testing.assert_allclose(file_data['eeg'], expected_eeg)
def test_brain_experiment(self):
    """Loads a one-trial experiment (wav file name + in-memory EEG)."""
    sample_rate = 4
    signals = {'one': np.arange(10) + 100, 'two': np.arange(10) + 200}
    brain_file = ingest.MemoryBrainDataFile(signals, sample_rate)

    sound_filename = 'subj01_1ksamples.wav'
    # The trial is named after the sound file, minus its extension.
    trial_name = ingest.BrainExperiment.delete_suffix(sound_filename, '.wav')
    experiment = ingest.BrainExperiment(
        {trial_name: [sound_filename, brain_file]},
        self._test_dir, self._test_dir)
    experiment.load_all_data(self._test_dir, self._test_dir)

    summary = experiment.summary()
    self.assertIn('Found 1 trials', summary)
    # 10 samples at 4 samples per second is the 2.5s expected here.
    self.assertIn(
        'Trial subj01_1ksamples: 2 EEG channels with 2.5s of eeg data',
        summary)
    experiment.z_score_all_data()
def ingest_data(self, cache_dir, tf_dir, desired_frame_rate):
    """Reads the Jens hearing-impaired data and ingests it to TFRecords.

    For every `sub-*` directory in `cache_dir` this reads the subject's BDF
    EEG recording and its events table, cuts the EEG into 48 trials that
    start at each target-audio onset, pairs every trial with its target
    (attended) and masker (unattended) audio features loaded from
    `derivatives/stimuli`, z-scores the experiment, and writes it to
    `tf_dir/subject_NN` together with a per-subject summary.

    Args:
      cache_dir: Local copy of the original archive file from the Internet.
      tf_dir: Folder where tfrecord files are written out.
      desired_frame_rate: Passed as the sample rate of each in-memory EEG
        record and to the summary.

    Raises:
      ValueError: If a subject does not have exactly 48 target onsets and 32
        masker onsets, or if a trial has more than one masker onset.
    """
    eeg_dir = '.'
    sound_dir = '.'
    # NOTE(review): the experiment is built with this fixed rate while the
    # EEG records below are tagged with desired_frame_rate -- confirm that
    # this difference is intended.
    frame_rate = 512
    make_if_not_exists(tf_dir)

    # All subject directories from the cache directory.
    all_dirs = tf.io.gfile.listdir(cache_dir)
    all_dirs_sub = sorted(f for f in all_dirs if f.startswith('sub-'))
    print(
        'Ingesting {} subject directories of Jens Hearing impaired data.'.
        format(len(all_dirs_sub)))

    for sid, subject_dir in enumerate(all_dirs_sub):
        # There is a single EEG and events file per subject.
        eeg_file = os.path.join(
            cache_dir, subject_dir,
            'eeg/{}_task-selectiveattention_eeg.bdf'.format(subject_dir))
        events_file = os.path.join(
            cache_dir, subject_dir,
            'eeg/{}_task-selectiveattention_events.tsv'.format(subject_dir))

        # Read in the events file and load the start times of attended and
        # unattended audio for all trials (48 trials, 32 with dual audio).
        # The tsv file is loaded into a pandas DataFrame, whose columns are
        # keyed by the column headers of the file.
        events_df = pd.read_csv(events_file, sep='\t')
        # Subject 24 has events described in 2 parts, hence the case below
        # which pulls the remaining data from the second file.
        if sid == 23:
            events_file_part = os.path.join(
                cache_dir, subject_dir,
                'eeg/{}_task-selectiveattention_run-2_events.tsv'.format(
                    subject_dir))
            events_df_part = pd.read_csv(events_file_part, sep='\t')
            events_df = pd.concat([events_df, events_df_part])

        # Attended audio is called target and unattended audio is called
        # masker. targetonset and maskeronset events mark the start of those
        # audio streams, stim_file names the file holding the audio stream
        # and sample is the EEG sample number at which the stream starts.
        is_target = events_df['trigger_type'] == 'targetonset'
        is_masker = events_df['trigger_type'] == 'maskeronset'
        start_samples = events_df[is_target]['sample'].values
        # Copy, so that the column rewrite below modifies an independent
        # frame rather than a slice of events_df.
        masker_df = events_df[is_masker][['sample', 'stim_file']].copy()
        # Get ids of experiments with masker audio from the file names: the
        # first character and the last four characters of the base name are
        # stripped and the remainder is parsed as the trial number.
        masker_df['stim_file'] = masker_df['stim_file'].apply(
            lambda x: int(x.split('/')[-1][1:-4]))

        # Sanity check.
        if len(start_samples) != 48 or len(masker_df) != 32:
            raise ValueError(
                'Incorrect event counts for subject %s: %d/48 and %d/32' %
                (subject_dir, len(start_samples), len(masker_df)))

        # Load the EEG file, which is in bdf format. The reader is used as a
        # context manager so the file handle is released even on errors.
        print('Reading in {}'.format(eeg_file))
        with pyedflib.EdfReader(eeg_file) as f:
            num_channels = f.signals_in_file
            sigbufs = np.zeros((f.getNSamples()[0], num_channels))
            for i in range(num_channels):
                sigbufs[:, i] = f.readSignal(i)

        # Stuff for trials.
        tf_dir_subject = os.path.join(tf_dir,
                                      'subject_{:02d}'.format(sid + 1))
        trial_dict = {}
        target_audio_signals = []  # List storing target audios
        masker_audio_signals = []  # List storing masker audios
        chopped_sigbufs = []  # EEG segments cut to the audio timing
        for trial_idx in range(1, 49):
            # Load the target audio file.
            target_audio_signal = loadmat(
                os.path.join(
                    cache_dir,
                    'derivatives/stimuli/sub{:03d}/target/t{:03d}.mat'.
                    format(sid + 1, trial_idx)))['dat']['feat']
            target_audio_signals.append(target_audio_signal)

            # Load the EEG portion based on the audio start time.
            target_start_time = start_samples[trial_idx - 1]
            current_chopped_signal = sigbufs[
                target_start_time:target_start_time +
                target_audio_signal.shape[0], :]
            chopped_sigbufs.append(current_chopped_signal)

            # Only trials having masker data will have start times. This is
            # used to distinguish between trials having dual speaker data
            # v/s ones having single speaker data.
            masker_start_time = masker_df[masker_df['stim_file'] ==
                                          trial_idx]['sample'].values
            if masker_start_time.size > 1:
                raise ValueError(
                    'Found %d masker onsets for trial %d of subject %s' %
                    (masker_start_time.size, trial_idx, subject_dir))
            # Test the number of matches, not the array's truth value: the
            # latter is an error for empty arrays in recent NumPy and would
            # treat an onset at sample 0 as "no masker".
            if masker_start_time.size:
                # Masker is offset from target audio by jitter seconds.
                # Align accordingly.
                masker_audio_file = os.path.join(
                    cache_dir,
                    'derivatives/stimuli/sub{:03d}/masker/m{:03d}.mat'.
                    format(sid + 1, trial_idx))
                masker_audio_signal = loadmat(
                    masker_audio_file)['dat']['feat']
                start_time_diff = int(masker_start_time[0] -
                                      target_start_time)
                # Shift the masker right by the offset, keeping its length.
                # An explicit end index is used because [:-0] would drop the
                # whole signal when the offset is zero.
                keep = max(len(masker_audio_signal) - start_time_diff, 0)
                masker_audio_signal = np.concatenate(
                    (np.zeros(start_time_diff), masker_audio_signal[:keep]))
                trial_key = 'trial_{:02d}_dual_speaker'.format(trial_idx)
            else:
                # Trials with single audio - dummy masker.
                masker_audio_signal = np.zeros_like(target_audio_signal)
                trial_key = 'trial_{:02d}_single_speaker'.format(trial_idx)
            masker_audio_signals.append(masker_audio_signal)
            assert len(masker_audio_signal) == len(target_audio_signal)

            eeg_dict = {'eeg_data': current_chopped_signal}
            audio_files_dict = {
                'attended_intensity': target_audio_signal,
                'unattended_intensity': masker_audio_signal
            }
            trial_dict[trial_key] = [
                audio_files_dict,
                ingest.MemoryBrainDataFile(eeg_dict, sr=desired_frame_rate)
            ]
            logging.info('Audio and EEG data shapes: %s, %s',
                         str(target_audio_signal.shape),
                         str(current_chopped_signal.shape))

        # The stacked arrays are only used for the shape report below.
        chopped_sigbufs = np.vstack(chopped_sigbufs)
        target_audio_signal_arr = np.hstack(target_audio_signals)
        masker_audio_signal_arr = np.hstack(masker_audio_signals)
        print('Raw EEG shape:{}'.format(sigbufs.shape))
        print('Cut EEG shape:{}'.format(chopped_sigbufs.shape))
        print('Target Audio shape: {}'.format(
            target_audio_signal_arr.shape))
        print('Masker audio shape: {}'.format(
            masker_audio_signal_arr.shape))

        exp = ingest.BrainExperiment(trial_dict, sound_dir, eeg_dir,
                                     frame_rate=frame_rate)
        exp.load_all_data(sound_dir, eeg_dir)
        exp.z_score_all_data()
        for trial in exp.iterate_trials():
            trial.assemble_brain_data('eeg_data')
            for k in trial.model_features:
                logging.info('Trial # %s, audio shapes %s', str(k),
                             str(trial.model_features[k].shape))

        make_if_not_exists(tf_dir_subject)
        exp.write_all_data(tf_dir_subject)
        # The summary covers everything found in the subject's directory.
        all_ingested_files = [
            os.path.join(tf_dir_subject, f)
            for f in os.listdir(tf_dir_subject)
        ]
        write_summary(cache_dir, tf_dir_subject, desired_frame_rate,
                      all_ingested_files)