Beispiel #1
0
  def ingest_data(self, cache_dir, tf_dir, desired_frame_rate):
    """Converts the Jens .mat recordings into per-subject TFRecord folders.

    Every ``*.mat`` file found in ``cache_dir`` is handled as one subject and
    written to ``tf_dir/subject_NN``.  Within each trial, rows 0..68 are taken
    as the EEG signal and row 69 as the audio signal; both are resampled to
    ``desired_frame_rate`` before being handed to a ``BrainExperiment``.

    Args:
      cache_dir: Local copy of the original archive file from the Internet
      tf_dir: Folder where tfrecord files are written out
      desired_frame_rate: Desired frame rate after ingestion.
    """
    subject_files = sorted(
        tf.io.gfile.glob(os.path.join(cache_dir, '*.mat')))
    sound_dir = eeg_dir = '.'
    make_if_not_exists(tf_dir)

    print('Ingesting %d files of Jens data.' % len(subject_files),
          file=regression_data_print)
    written_files = []
    for subject_num, subject_file in enumerate(subject_files, start=1):
      print('Ingesting %s' % subject_file, file=regression_data_print)
      subject_out_dir = os.path.join(
          tf_dir, 'subject_{:02d}'.format(subject_num))
      recording = loadmat(subject_file)['data']
      # Both frame rates should be 128Hz according to:
      #   https://zenodo.org/record/1158410/#.XvqtpZNKjVs
      wav_fs = eeg_fs = recording['fsample']

      trials = {}
      for trial_num, samples in enumerate(recording['trial'], start=1):
        # Transpose so that time runs along the first axis.
        raw_eeg = samples[:69, :].T
        raw_audio = samples[69:70, :].T
        eeg_resampler = preprocess.Preprocessor(
            'eeg', eeg_fs, desired_frame_rate)
        resampled_eeg = eeg_resampler.resample(raw_eeg)
        audio_resampler = preprocess.Preprocessor(
            'audio', wav_fs, desired_frame_rate)
        resampled_audio = audio_resampler.resample(raw_audio)
        brain_file = ingest.MemoryBrainDataFile(
            {'eeg_data': resampled_eeg}, sr=desired_frame_rate)
        trials['trial_{:02d}'.format(trial_num)] = [
            {'intensity': resampled_audio},
            brain_file,
        ]
        logging.info('Audio and EEG data shapes: %s, %s',
                     str(raw_audio.shape), str(raw_eeg.shape))

      exp = ingest.BrainExperiment(
          trials, sound_dir, eeg_dir, frame_rate=desired_frame_rate)
      exp.load_all_data(sound_dir, eeg_dir)
      exp.z_score_all_data()
      for trial in exp.iterate_trials():
        trial.assemble_brain_data('eeg_data')
        for feature_name in trial.model_features:
          feature_shape = trial.model_features[feature_name].shape
          logging.info('Trial # %s, audio shapes %s', str(feature_name),
                       str(feature_shape))
      make_if_not_exists(subject_out_dir)
      written_files.extend(exp.write_all_data(subject_out_dir))

    write_summary(cache_dir, tf_dir, desired_frame_rate, written_files)
Beispiel #2
0
    def ingest_data(self, cache_dir, tf_dir, desired_frame_rate):
        """Reads the Telluride4 data from mat file and ingests it to TFRecords.

        Loads ``Telluride2015.mat`` from ``cache_dir``, pairs every EEG trial
        ``i`` with audio signal ``i % 4`` (plus an all-ones signal of the same
        shape and dtype), z-scores the experiment and writes the trials to
        ``tf_dir``, followed by a summary.

        Args:
          cache_dir: Directory with the local copy of the original data
          tf_dir: Where to store the TFRecord files.
          desired_frame_rate: Desired frame rate after ingestion.

        Raises:
          ValueError: If the file does not hold exactly 4 audio signals or
            exactly 32 EEG trials.
        """
        mat_data = loadmat(os.path.join(cache_dir, 'Telluride2015.mat'))
        mat_objects = mat_data['data']

        eeg_signals = mat_objects['eeg']
        audio_signals = mat_objects['wav']
        logging.info('Audio and EEG data shapes: %s, %s',
                     str(audio_signals.shape), str(eeg_signals.shape))
        # The shape tuples are wrapped in 1-tuples below: a bare tuple on the
        # right of % is unpacked as the argument list, which raises TypeError
        # for any shape with more than one dimension.
        if audio_signals.shape[0] != 4:  # Number of audio files.
            raise ValueError('Incorrect shapes for audio_signals (%s)' %
                             (audio_signals.shape,))
        if eeg_signals.shape[0] != 32:  # Number of trials.
            raise ValueError('Incorrect shapes for eeg_signals (%s)' %
                             (eeg_signals.shape,))
        eeg_signals_count = eeg_signals.shape[0]

        make_if_not_exists(tf_dir)
        trial_dict = {}
        for i in range(eeg_signals_count):
            # The 4 audio signals are reused cyclically across the trials.
            audio_signal = audio_signals[i % 4]
            sound_dict = {
                'intensity': audio_signal,
                'ones': np.ones(audio_signal.shape, dtype=audio_signal.dtype),
            }
            eeg_dict = {'eeg_data': eeg_signals[i]}
            trial_dict['trial_{:02d}'.format(i + 1)] = [
                sound_dict, ingest.MemoryBrainDataFile(eeg_dict)
            ]

        eeg_dir = '.'
        sound_dir = '.'
        exp = ingest.BrainExperiment(trial_dict,
                                     sound_dir,
                                     eeg_dir,
                                     frame_rate=desired_frame_rate)
        exp.load_all_data(sound_dir, eeg_dir)
        exp.z_score_all_data()
        for trial in exp.iterate_trials():
            trial.assemble_brain_data('eeg_data')
            for k in trial.model_features:
                logging.info('Trial#, Audio shape: %s, %s', str(k),
                             str(trial.model_features[k].shape))
        all_ingested_files = exp.write_all_data(tf_dir)

        write_summary(cache_dir, tf_dir, desired_frame_rate,
                      all_ingested_files)
Beispiel #3
0
 def test_memory_brain_data_file(self):
     """Checks names, sample rates and values exposed by MemoryBrainDataFile."""
     sample_rate = 4
     signals = {
         'one': np.arange(10) + 100,
         'two': np.arange(10) + 200,
     }
     data_file = ingest.MemoryBrainDataFile(signals, sample_rate)

     self.assertEqual(set(data_file.signal_names), set(signals))
     # Every channel must report the rate the file was created with ...
     for name in ('one', 'two'):
         self.assertEqual(data_file.signal_fs(name), sample_rate)
     # ... and hand back exactly the samples that were stored.
     for name in ('one', 'two'):
         stored = data_file.signal_values(name)
         self.assertTrue(np.all(stored == signals[name]))
Beispiel #4
0
    def test_brain_memory_experiment2(self):
        """Round-trips one in-memory trial through TFRecords after an offset fix.

        Builds a single trial from 1s of random audio (16 kHz) and 2s of
        two-channel integer EEG (100 Hz), runs compute_intensity,
        fix_eeg_offset(1.0) and assemble_brain_data, then writes the trial out
        and checks that the file holds ``frame_sr`` records whose 'eeg' values
        equal the second half of the two channels.
        """
        # Function-scope imports: only this test needs them.
        import shutil
        import tempfile

        fs = 16000
        audio_len = fs
        audio_data = np.random.randn(audio_len)

        frame_sr = 100
        channel_one = np.arange(2 * frame_sr)  # Use ints for easier debugging
        channel_two = np.arange(2 * frame_sr) + 200
        eeg_data = {'C1': channel_one, 'C2': channel_two}
        df = ingest.MemoryBrainDataFile(eeg_data, frame_sr)

        trial_two_name = 'trial_2'
        experiment_dict = {
            trial_two_name: [{
                'audio_data': audio_data,
                'audio_sr': fs
            }, df],
        }
        experiment = ingest.BrainExperiment(experiment_dict,
                                            self._test_dir,
                                            self._test_dir,
                                            frame_rate=frame_sr)
        self.assertTrue(experiment)
        experiment.load_all_data(self._test_dir, self._test_dir)
        summary = experiment.summary()
        self.assertIn('Found 1 trials', summary)
        self.assertIn('Trial trial_2: 2 EEG channels with 2s of eeg data',
                      summary)
        for trial in experiment.iterate_trials():
            trial.compute_intensity()
            trial.fix_eeg_offset(1.0)
            trial.assemble_brain_data(list(eeg_data.keys()))
            # Master copy of EEG data has moved from brain_data to
            # audio_features dict
            brain_data = trial.audio_features['eeg']
            # Now the eeg size is shorter, due to fix_eeg_offset above.
            self.assertEqual(brain_data.shape, (frame_sr, 2))

        # Write into a private directory instead of the shared '/tmp' so that
        # concurrent or earlier runs cannot interfere, and remove it afterwards.
        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        experiment.write_all_data(tmp_dir)
        tf_file = os.path.join(tmp_dir, trial_two_name + '.tfrecords')

        count, error = ingest.count_tfrecords(tf_file)
        self.assertEqual(error, 0)
        self.assertEqual(count, frame_sr)

        file_data = ingest.read_tfrecords(tf_file)
        print('Read in data and found keys:', list(file_data.keys()))
        self.assertIn('eeg', file_data)
        self.assertIn('loudness', file_data)

        np.testing.assert_allclose(
            file_data['eeg'],
            np.hstack((np.reshape(channel_one[frame_sr:], (-1, 1)),
                       np.reshape(channel_two[frame_sr:], (-1, 1)))))
    def test_brain_memory_experiment(self):
        """Round-trips one in-memory trial through TFRecords unchanged.

        A single trial is built from 2s of random audio (16 kHz) and 2s of
        two-channel integer EEG (100 Hz); after assembling the EEG and writing
        the experiment, the 'eeg' data read back must equal the two channels
        stacked as columns.
        """
        audio_rate = 16000
        frame_sr = 100
        eeg_len = 2 * frame_sr
        audio_data = np.random.randn(2 * audio_rate, 1)

        # Use ints for easier debugging
        channel_one = np.arange(eeg_len)
        channel_two = np.arange(eeg_len) + 200
        eeg_data = collections.OrderedDict(
            [('C1', channel_one), ('C2', channel_two)])
        brain_file = ingest.MemoryBrainDataFile(eeg_data, frame_sr)

        trial_two_name = 'trial_2'
        audio_entry = {'audio_data': audio_data, 'audio_sr': audio_rate}
        experiment = ingest.BrainExperiment(
            {trial_two_name: [audio_entry, brain_file]},
            self._test_dir,
            self._test_dir,
            frame_rate=frame_sr)
        self.assertTrue(experiment)
        experiment.load_all_data(self._test_dir, self._test_dir)

        summary = experiment.summary()
        for expected in (
                'Found 1 trials',
                'Trial trial_2: 2 EEG channels with 2s of eeg data'):
            self.assertIn(expected, summary)

        channel_names = list(eeg_data.keys())
        for trial in experiment.iterate_trials():
            trial.assemble_brain_data(channel_names)
            # Master copy of EEG data has moved from brain_data to
            # model_features dict
            self.assertEqual(trial.model_features['eeg'].shape, (eeg_len, 2))

        tmp_dir = flags.FLAGS.test_tmpdir or '/tmp'
        all_ingested_files = experiment.write_all_data(tmp_dir)
        self.assertLen(all_ingested_files, 1)
        tf_file = os.path.join(tmp_dir, trial_two_name + '.tfrecords')

        _, error = ingest.count_tfrecords(tf_file)
        self.assertEqual(error, 0)

        file_data = ingest.read_tfrecords(tf_file)
        self.assertIn('eeg', file_data)

        expected_eeg = np.column_stack(
            (channel_one[:eeg_len], channel_two[:eeg_len]))
        np.testing.assert_allclose(file_data['eeg'], expected_eeg)
Beispiel #6
0
 def test_brain_experiment(self):
     """Loads a one-trial experiment from a wav file and in-memory EEG.

     Verifies the text summary and that z-scoring runs on the loaded data.
     """
     sample_rate = 4
     signals = {
         'one': np.arange(10) + 100,
         'two': np.arange(10) + 200,
     }
     brain_file = ingest.MemoryBrainDataFile(signals, sample_rate)

     sound_filename = 'subj01_1ksamples.wav'
     # The trial is named after the sound file without its extension.
     trial_name = ingest.BrainExperiment.delete_suffix(
         sound_filename, '.wav')
     experiment = ingest.BrainExperiment(
         {trial_name: [sound_filename, brain_file]},
         self._test_dir, self._test_dir)
     experiment.load_all_data(self._test_dir, self._test_dir)

     summary = experiment.summary()
     self.assertIn('Found 1 trials', summary)
     expected_line = ('Trial subj01_1ksamples: 2 EEG channels with 2.5s of '
                      'eeg data')
     self.assertIn(expected_line, summary)
     experiment.z_score_all_data()
Beispiel #7
0
    def ingest_data(self, cache_dir, tf_dir, desired_frame_rate):
        """Ingests the Jens hearing-impaired BDF data into TFRecords.

        For every ``sub-*`` directory under ``cache_dir`` this reads the
        subject's BDF EEG recording and events TSV, cuts the EEG into 48
        trials starting at the target-audio onsets, pairs every trial with its
        target (attended) and masker (unattended) audio features, and writes
        one ``subject_NN`` folder of TFRecord files plus a summary per subject.

        Args:
          cache_dir: Local copy of the original archive file from the Internet
          tf_dir: Folder where tfrecord files are written out
          desired_frame_rate: Desired frame rate after ingestion.

        Raises:
          ValueError: If a subject does not have exactly 48 target onsets and
            32 masker onsets, or if one trial matches several masker onsets.
        """
        eeg_dir = '.'
        sound_dir = '.'

        # NOTE(review): the experiment below is created with this fixed rate,
        # while the EEG files are tagged with desired_frame_rate and nothing in
        # this method resamples the data -- confirm this is intended.
        frame_rate = 512
        make_if_not_exists(tf_dir)

        # All subject directories from the cache directory
        all_dirs = tf.io.gfile.listdir(cache_dir)
        all_dirs_sub = sorted([f for f in all_dirs if f.startswith('sub-')])
        print(
            'Ingesting {} subject directories of Jens Hearing impaired data.'.
            format(len(all_dirs_sub)))

        for sid, subject_dir in enumerate(all_dirs_sub):
            # There is a single EEG and events file per subject
            eeg_file = os.path.join(
                cache_dir, subject_dir,
                'eeg/{}_task-selectiveattention_eeg.bdf'.format(subject_dir))
            events_file = os.path.join(
                cache_dir, subject_dir,
                'eeg/{}_task-selectiveattention_events.tsv'.format(
                    subject_dir))

            # Read in events file and load the start times of attended and
            # unattended audio for all trials (48 trials, 32 with dual audio)
            # Loading the events tsv file into a pandas Dataframe object:
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
            # - a tabular format. Each column in the table, accessed using a
            # string key holds an array of objects of a fixed type. Keys are
            # the column headers in the tsv file.
            events_df = pd.read_csv(events_file, sep='\t')
            # Subject 24 has events described in 2 parts hence the if case
            # below which pulls the remaining data from the second file.
            if sid == 23:
                events_file_part = os.path.join(
                    cache_dir, subject_dir,
                    'eeg/{}_task-selectiveattention_run-2_events.tsv'.format(
                        subject_dir))
                events_df_part = pd.read_csv(events_file_part, sep='\t')
                events_df = pd.concat([events_df, events_df_part])

            # Attended audio is called target and unattended audio called
            # masker. targetonset and maskeronset events indicate the start of
            # both of those audio streams. stim_file contains the filename
            # storing the audio stream and sample indicates the EEG sample
            # number at which the audio stream starts.
            start_samples = events_df[events_df['trigger_type'] ==
                                      'targetonset']['sample'].values
            # Copy the selection so the column assignment below modifies an
            # independent frame rather than a slice of events_df.
            masker_df = events_df[events_df['trigger_type'] == 'maskeronset'][[
                'sample', 'stim_file'
            ]].copy()
            # Get ids of experiments with masker audio from file names
            masker_df['stim_file'] = masker_df['stim_file'].apply(
                lambda x: int(x.split('/')[-1][1:-4]))

            # Sanity check
            if len(start_samples) != 48 or len(masker_df) != 32:
                raise ValueError(
                    'Incorrect event counts for subject %s: %d/48 and %d/32' %
                    (subject_dir, len(start_samples), len(masker_df)))

            # Load EEG file, which is in bdf format. The reader is used as a
            # context manager so the file handle is released right after the
            # signals have been copied out.
            print('Reading in {}'.format(eeg_file))
            with pyedflib.EdfReader(eeg_file) as edf_reader:
                num_channels = edf_reader.signals_in_file
                sigbufs = np.zeros(
                    (edf_reader.getNSamples()[0], num_channels))
                for i in range(num_channels):
                    sigbufs[:, i] = edf_reader.readSignal(i)

            # Stuff for trials
            tf_dir_subject = os.path.join(tf_dir,
                                          'subject_{:02d}'.format(sid + 1))
            trial_dict = {}
            target_audio_signals = []  # List storing target audios
            masker_audio_signals = []  # List storing masker audios
            # List storing chopped signals based on audio timing
            chopped_sigbufs = []
            for trial_idx in range(1, 49):
                # Load the target audio file
                target_audio_signal = loadmat(
                    os.path.join(
                        cache_dir,
                        'derivatives/stimuli/sub{:03d}/target/t{:03d}.mat'.
                        format(sid + 1, trial_idx)))['dat']['feat']
                target_audio_signals.append(target_audio_signal)
                # Load EEG portion based on audio start time
                target_start_time = start_samples[trial_idx - 1]
                current_chopped_signal = sigbufs[
                    target_start_time:target_start_time +
                    target_audio_signal.shape[0], :]
                chopped_sigbufs.append(current_chopped_signal)

                # Only trials having masker data will have start times. This is
                # used to distinguish between trials having dual speaker data
                # v/s ones having single speaker data.
                masker_start_time = masker_df[masker_df['stim_file'] ==
                                              trial_idx]['sample'].values
                # Test the number of matches explicitly: the truth value of an
                # array is an error when it is empty (recent NumPy), ambiguous
                # with several entries, and False for a lone onset at sample 0.
                if masker_start_time.size > 1:
                    raise ValueError(
                        'Found %d masker onsets for trial %d of subject %s' %
                        (masker_start_time.size, trial_idx, subject_dir))
                if masker_start_time.size:
                    # Masker is offset from target audio by jitter seconds
                    # Align accordingly
                    masker_audio_file = os.path.join(
                        cache_dir,
                        'derivatives/stimuli/sub{:03d}/masker/m{:03d}.mat'.
                        format(sid + 1, trial_idx))
                    masker_audio_signal = loadmat(
                        masker_audio_file)['dat']['feat']
                    start_time_diff = (int(masker_start_time[0]) -
                                       int(target_start_time))
                    # Delay the masker by prepending zeros and dropping the
                    # same number of samples from its end. The kept length is
                    # computed explicitly because a [:-0] slice would be empty
                    # when there is no jitter.
                    kept_len = max(
                        len(masker_audio_signal) - start_time_diff, 0)
                    masker_audio_signal = np.concatenate(
                        (np.zeros(start_time_diff),
                         masker_audio_signal[:kept_len]))
                    trial_key = 'trial_{:02d}_dual_speaker'.format(trial_idx)
                else:
                    # Trials with single audio - dummy masked
                    masker_audio_signal = np.zeros_like(target_audio_signal)
                    trial_key = 'trial_{:02d}_single_speaker'.format(trial_idx)
                masker_audio_signals.append(masker_audio_signal)

                assert len(masker_audio_signal) == len(target_audio_signal)
                eeg_dict = {'eeg_data': current_chopped_signal}
                audio_files_dict = {
                    'attended_intensity': target_audio_signal,
                    'unattended_intensity': masker_audio_signal
                }
                trial_dict[trial_key] = [
                    audio_files_dict,
                    ingest.MemoryBrainDataFile(eeg_dict, sr=desired_frame_rate)
                ]
                logging.info('Audio and EEG data shapes: %s, %s',
                             str(target_audio_signal.shape),
                             str(current_chopped_signal.shape))
            # The stacked arrays are only used to report the overall shapes.
            chopped_sigbufs = np.vstack(chopped_sigbufs)
            target_audio_signal_arr = np.hstack(target_audio_signals)
            masker_audio_signal_arr = np.hstack(masker_audio_signals)
            print('Raw EEG shape:{}'.format(sigbufs.shape))
            print('Cut EEG shape:{}'.format(chopped_sigbufs.shape))
            print('Target Audio shape: {}'.format(
                target_audio_signal_arr.shape))
            print('Masker audio shape: {}'.format(
                masker_audio_signal_arr.shape))
            exp = ingest.BrainExperiment(trial_dict,
                                         sound_dir,
                                         eeg_dir,
                                         frame_rate=frame_rate)
            exp.load_all_data(sound_dir, eeg_dir)
            exp.z_score_all_data()
            for trial in exp.iterate_trials():
                trial.assemble_brain_data('eeg_data')
                for k in trial.model_features:
                    logging.info('Trial # %s, audio shapes %s', str(k),
                                 str(trial.model_features[k].shape))
            make_if_not_exists(tf_dir_subject)
            exp.write_all_data(tf_dir_subject)
            # The summary lists everything found in the subject folder.
            all_ingested_files = [
                os.path.join(tf_dir_subject, f)
                for f in os.listdir(tf_dir_subject)
            ]
            write_summary(cache_dir, tf_dir_subject, desired_frame_rate,
                          all_ingested_files)