Example #1
0
 def simple_edss(edss):
     '''
     Use only a few columns so that we don't make 21*20 coherence pairs
     '''
     all_channels = util_funcs.get_common_channel_names()
     subset_channels = [all_channels.index(channel) for channel in complex_feature_channels]
     return [(datum[0][:, subset_channels], datum[1]) for datum in edss]
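A rough worked count of why the channel subset matters (illustrative only; the 21-channel figure comes from the docstring above, and complex_feature_channels is assumed to be supplied by the surrounding experiment config):

    from itertools import combinations
    n_all = 21
    n_subset = 5  # hypothetical subset size
    len(list(combinations(range(n_all), 2)))     # 210 unordered pairs (21*20 ordered)
    len(list(combinations(range(n_subset), 2)))  # 10 unordered pairs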
Example #2
0
def use_all_channels_for_coherence_detect_knn():
    complex_feature_channels = util_funcs.get_common_channel_names() #returns all the channels
    train_pkl="/home/msaqib/trainSeizureData_expanded.pkl"
    valid_pkl="/home/msaqib/validSeizureData_expanded.pkl"
    test_pkl="/home/msaqib/testSeizureData_expanded.pkl"
    random_under_sample_data_gen = True
    use_simple_hand_engineered_features = False
Example #3
0
def use_lstm():
    discretize_age = False
    output_size = 1
    use_simple_lstm = True
    kbins_encoding = "onehot-dense"
    window = 5
    early_stopping = True
    patience = 10
    # variable batch, variable time steps, but constant num features
    input_shape = (None, None,
                   (len(read.EdfFFTDatasetTransformer.freq_bins) - 1) *
                   len(util_funcs.get_common_channel_names()))
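A minimal sketch of a model that would accept the (batch, time, features) shape described above, assuming tf.keras; the layer width and feature count below are placeholders, not values from the source:

    import tensorflow as tf

    num_features = 21 * 9  # placeholder for (#freq bins - 1) * #common channels; real values come from read/util_funcs
    model = tf.keras.Sequential([
        # batch and time dimensions stay unspecified; only the feature count is fixed
        tf.keras.layers.LSTM(32, input_shape=(None, num_features)),
        tf.keras.layers.Dense(1, activation="sigmoid"),  # output_size = 1 as in the config above
    ])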
Example #4
0
def run_prep(file_name, annotation, split="train"):
        data = mne.io.read_raw_edf(file_name, preload=True)
        data = data.pick_channels(util_funcs.get_common_channel_names()) # use the 21 channels guaranteed in each sample
        data = data.reorder_channels(util_funcs.get_common_channel_names())
        data.rename_channels(constants.MNE_CHANNEL_EDF_MAPPING)
        data.resample(512) #upsample to highest frequency, as per best practice
        data.set_eeg_reference()

        data.set_montage("standard_1020")
        data.filter(1, 50)

        montage_kind = "standard_1020"
        maxTime = annotation.index.max()/pd.Timedelta(seconds=1)
        montage = mne.channels.make_standard_montage(montage_kind)
        ref, patient, session, token = read.parse_edf_token_path_structure(file_name)

        # for i in range(int(maxTime/2)):
        basePath = f"/n/scratch2/ms994/medium_size/{split}/{patient}/{session}/{token}/"
        Path(basePath).mkdir( parents=True, exist_ok=True)


        shutil.copyfile(file_name[:-4]+".tse", f"{basePath}label.tse")
        shutil.copyfile(file_name[:-4]+".lbl", f"{basePath}montage.lbl")
        shutil.copyfile(file_name[:-9]+".txt", f"{basePath}notes.txt")

        dataDict = Dict()

        for i in range(int(maxTime/2) - 1):
            croppedData = data.copy().crop(i*2, i*2 + 4)
            croppedData.resample(constants.COMMON_FREQ) #resample to minimum
            dataDict[i].index = i
            dataDict[i].data = croppedData
            dataDict[i].start = i*2
            dataDict[i].end = i*2 + 4
            if (i % 500 == 499): # save up to 500 segments at a time to avoid an IO bottleneck in scratch2, and to avoid creating any single pickle that is too big to load
                pickle.dump(dataDict, open(basePath+f"intermediate_{int(np.ceil(i/500))}", "wb"))
                dataDict = Dict()
        if len(dataDict) != 0: #dump any leftover segments; i+1 keeps this index from colliding with the last full chunk's dump
            pickle.dump(dataDict, open(basePath+f"intermediate_{int(np.ceil((i+1)/500))}", "wb"))
        print(f"COMPLETED {file_name}")
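A minimal sketch of how the chunked intermediate_* pickles written above could be read back and merged (the loader below is hypothetical; Dict is assumed to be addict.Dict, matching the Dict() used in run_prep):

    import glob, pickle
    from addict import Dict

    def load_segments(basePath):
        merged = Dict()
        for part in sorted(glob.glob(basePath + "intermediate_*")):
            with open(part, "rb") as f:
                merged.update(pickle.load(f))  # keys are the segment indices i
        return merged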
Example #5
0
 def __init__(
         self,
         data_split,
         ref,
         num_files=None,
         resample=pd.Timedelta(seconds=constants.COMMON_DELTA),
         start_offset=pd.Timedelta(
             seconds=0),  #start at 0 unless we want something different
         max_length=None,
         expand_tse=False,  #save memory: don't build the time-by-annotation df
         dtype=np.float32,
         n_process=None,
         use_average_ref_names=True,
         filter=True,
         lp_cutoff=1,
         hp_cutoff=50,  #get close to nyq without actually hitting it
         order_filt=5,
         columns_to_use=util_funcs.get_common_channel_names(),
         use_numpy=False,
         specific_seiz_types=None):
     self.data_split = data_split
     if n_process is None:
         n_process = mp.cpu_count()
     self.n_process = n_process
     self.ref = ref
     self.resample = resample
     self.dtype = dtype
     self.start_offset = start_offset
     self.max_length = max_length
     self.manager = mp.Manager()
     self.edf_tokens = get_all_token_file_names(data_split, ref)
     self.specific_seiz_types = specific_seiz_types
     if self.specific_seiz_types is not None:
         pass #the original line here is truncated ("util_funcs.g"); seizure-type filtering is left unimplemented
     self.expand_tse = expand_tse
     self.use_average_ref_names = use_average_ref_names
     if num_files is not None:
         self.edf_tokens = self.edf_tokens[0:num_files]
     self.filter = filter
     self.hp_cutoff = hp_cutoff
     self.lp_cutoff = lp_cutoff
     self.order_filt = order_filt
     self.columns_to_use = columns_to_use
     self.use_numpy = use_numpy
Example #6
0
    def __init__(self,
                 edfRawData,
                 n_process=None,
                 coherence_all=True,
                 coherence_pairs=None,
                 average_coherence=True,
                 coherence_bin=None,
                 columns_to_use=util_funcs.get_common_channel_names(),
                 is_pandas=True,
                 is_tuple_data=True):
        """

        Parameters
        ----------
        edfRawData : DataFrame
            An array-like holding the data for coherence
        n_process : int
            number of processes to use when indexing a slice
        coherence_all : bool
            Whether to run pair-wise coherence on all channels; if so the
            number of features grows to n*(n-1) for n channels
        coherence_pairs : list
            If coherence_all is False, a list of tuples holding the column
            pairs to run coherence measurements on
        average_coherence : bool
            If True, average each pair's coherence over all represented
            frequencies. If False, use coherence_bin to histogram-bin everything

        Returns
        -------
        CoherenceTransformer
            Array-like

        """
        self.edfRawData = edfRawData
        self.n_process = n_process
        self.is_pandas = is_pandas
        self.coherence_all = coherence_all
        self.coherence_pairs = coherence_pairs
        self.average_coherence = average_coherence
        self.coherence_bin = coherence_bin
        self.columns_to_use = columns_to_use
        self.is_tuple_data = is_tuple_data
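A minimal sketch of the per-pair computation the docstring describes, using scipy.signal.coherence and averaging over frequencies when average_coherence is True (illustrative only; the transformer's real indexing code is not shown, and the binned branch is a rough stand-in for coherence_bin):

    import numpy as np
    from scipy.signal import coherence

    def pair_coherence(x, y, fs, average=True, bins=10):
        f, cxy = coherence(x, y, fs=fs)          # magnitude-squared coherence per frequency
        if average:
            return cxy.mean()                     # single averaged feature per channel pair
        return np.histogram(f, bins=bins, weights=cxy)[0]  # rough stand-in for coherence_bin binning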
Example #7
0
def get_random_channel_ordering():
    channel_ordering = [
        i for i in range(len(util_funcs.get_common_channel_names()))
    ]
    np.random.shuffle(channel_ordering)
    return channel_ordering
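Assuming raw data shaped (time, channels) in the common channel order (and that util_funcs is importable), the shuffled ordering above could be applied like this (illustrative):

    import numpy as np

    data = np.zeros((1000, 21))               # placeholder array: 1000 time steps, 21 channels
    ordering = get_random_channel_ordering()
    shuffled = data[:, ordering]              # same samples, channels permuted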
Example #8
0
def use_all_columns():
    columns_to_use = util_funcs.get_common_channel_names()
Example #9
0
    def __init__(
            self,
            data_split,
            ref,
            num_files=None,
            resample=pd.Timedelta(
                seconds=constants.COMMON_DELTA),
            max_length=pd.Timedelta(seconds=4),
            expand_tse=False, #save memory: don't build the time-by-annotation df
            dtype=np.float32,
            n_process=None,
            use_average_ref_names=True,
            filter=True,
            lp_cutoff=1,
            hp_cutoff=50, #get close to nyq without actually hitting it to avoid errors
            order_filt=5,
            columns_to_use=util_funcs.get_common_channel_names(),
            use_numpy=True,
            ensemble_mode=RANDOM_SAMPLE_ENSEMBLE,
            max_num_samples=20,
            file_lengths=None, #automatically populated if not given
            edf_tokens=None,
            labels=None, # labels that map to edf token level
            generate_sample_info=True
            ):
        if labels is not None:
            assert len(labels) == len(edf_tokens)
        self.data_split = data_split
        if n_process is None:
            n_process = mp.cpu_count()
        self.n_process = n_process
        self.ref = ref
        self.resample = resample
        self.dtype = dtype
        if (type(max_length) == int):
            max_length = max_length * pd.Timedelta(seconds=constants.COMMON_DELTA) #an int is interpreted as a number of COMMON_DELTA-long samples
        self.max_length = max_length
        self.manager = mp.Manager()
        if edf_tokens is None:
            self.edf_tokens = read.get_all_token_file_names(data_split, ref)
        else:
            self.edf_tokens = edf_tokens
        self.expand_tse = expand_tse
        self.use_average_ref_names = use_average_ref_names
        if num_files is not None:
            self.edf_tokens = self.edf_tokens[0:num_files]
        self.filter = filter
        self.hp_cutoff = hp_cutoff
        self.lp_cutoff = lp_cutoff
        self.order_filt = order_filt
        self.columns_to_use = columns_to_use
        self.use_numpy = use_numpy
        self.ensemble_mode = ensemble_mode
        self.max_num_samples = max_num_samples
        if file_lengths is None:
            file_lengths = util_funcs.get_file_sizes(data_split, ref)
        self.file_lengths = file_lengths
        self.labels = labels

        self.sampleInfo = Dict()
        if generate_sample_info:
            self.generateSampleInfo()
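For the integer max_length branch above, a quick illustration of the conversion (COMMON_DELTA's real value lives in constants; 1/256 s is only an assumption here):

    import pandas as pd

    COMMON_DELTA = 1 / 256                           # assumed sample spacing in seconds
    max_length = 1024                                # an integer is treated as a sample count
    max_length * pd.Timedelta(seconds=COMMON_DELTA)  # -> Timedelta('0 days 00:00:04')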
Example #10
0
    def __init__(
        self,
        segment_file_tuples,
        columns_to_use=util_funcs.get_common_channel_names(),
        use_numpy=True,
        lp_cutoff=1,
        hp_cutoff=50,
        random_under_sample=False,
        order_filt=5,
        include_seizure_type=False,
        mode=DETECT_MODE,
        resample=pd.Timedelta(seconds=constants.COMMON_DELTA),
        # num_splits_per_sample= None,
        gap=pd.Timedelta(seconds=1),
        # overlap = None,
        return_sequence_label=False,
        num_samples=None,
        max_bckg_samps_per_file=None,
        overlapping_augmentation=False,
        n_process=4,
        include_montage_channels=False, # which montage channels have seizure
        include_segment=False,
        shuffle=True,
    ):
        self.mode = mode
        self.n_process = n_process
        self.resample = resample
        self.segment_file_tuples = segment_file_tuples
        self.columns_to_use = columns_to_use
        self.use_numpy = use_numpy
        self.lp_cutoff = lp_cutoff
        self.hp_cutoff = hp_cutoff
        self.order_filt = order_filt
        self.sampleInfo = Dict()
        self.gap = gap
        # if overlap = None:
        #     self.overlap = gap
        # else:
        #     self.overlap = overlap
        self.include_seizure_type = include_seizure_type
        self.num_samples = num_samples
        self.return_sequence_label = return_sequence_label
        self.random_under_sample = random_under_sample
        self.overlapping_augmentation = overlapping_augmentation
        self.include_montage_channels = include_montage_channels
        self.include_segment = include_segment
        # self.num_splits_per_sample = num_splits_per_sample
        currentIndex = 0
        for token_file_path, segment in self.segment_file_tuples:
            if shuffle:
                segment = segment.reindex(np.random.permutation(segment.index)) #randomly sample from each eeg file
            num_bckg_samps_per_file = 0
            for time_period, label in segment.iteritems():
                # segment = segment.resample(gap).mode() #if gap isn't correct size, just resample
                if num_samples is not None and currentIndex >= self.num_samples:
                    break
                if max_bckg_samps_per_file is not None and num_bckg_samps_per_file >= max_bckg_samps_per_file and label == "bckg":
                    continue
                if (label != "bckg" and "sz" not in label and self.mode == EdfDatasetSegmentedSampler.DETECT_MODE):
                    continue #skip labels that are neither background nor a seizure class
                if (label == "postsz" or label == "presz"):
                    continue

                # for split_num in range(num_splits_per_sample):
                if self.mode == EdfDatasetSegmentedSampler.DETECT_MODE:
                    self.sampleInfo[currentIndex].label = ("sz" in label)


                if (label != "bckg" and label != "sample" and self.mode == EdfDatasetSegmentedSampler.PREDICT_MODE):
                    continue #skip: in predict mode we keep only "bckg" and "sample" segments; anything else is a seizure or too close to one
                if self.mode == EdfDatasetSegmentedSampler.PREDICT_MODE:
                    self.sampleInfo[currentIndex].label = (label == "sample")

                if label == "bckg":
                    num_bckg_samps_per_file += 1

                if self.include_seizure_type:
                    self.sampleInfo[currentIndex].label = (self.sampleInfo[currentIndex].label, label) #attach the specific label on the EDSS

                self.sampleInfo[currentIndex].token_file_path = token_file_path
                if self.include_segment:
                    self.sampleInfo[currentIndex].label = (*self.sampleInfo[currentIndex].label, self.sampleInfo[currentIndex].token_file_path)
                self.sampleInfo[currentIndex].sample_num = (time_period) / self.gap
                self.sampleInfo[currentIndex].sample_width = self.gap
                currentIndex += 1
        if self.random_under_sample:
            self.balance()
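balance() itself is not shown in this snippet; a minimal sketch of what random undersampling over sampleInfo could look like, keeping every positive sample and downsampling the rest (Dict is assumed to be addict.Dict, and the tuple-label handling mirrors include_seizure_type above):

    import numpy as np
    from addict import Dict

    def random_under_sample(sampleInfo):
        def is_positive(label):
            # when include_seizure_type is set, the boolean label is the first tuple element
            return label[0] if isinstance(label, tuple) else label
        pos = [k for k, v in sampleInfo.items() if is_positive(v.label)]
        neg = [k for k, v in sampleInfo.items() if not is_positive(v.label)]
        kept_neg = np.random.choice(neg, size=min(len(pos), len(neg)), replace=False)
        balanced = Dict()
        for new_key, old_key in enumerate(pos + list(kept_neg)):
            balanced[new_key] = sampleInfo[old_key]
        return balanced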
Example #11
0
def edf_eeg_2_df(path,
                 resample=None,
                 dtype=np.float32,
                 start=0,
                 filter=True,
                 max_length=None):
    """ Transforms from EDF to pd.df, with channel labels as columns.
        This does not attempt to concatenate multiple time series but only takes
        a single edf filepath

    Parameters
    ----------
    path : str
        path of the edf file

    resample : pd.Timedelta
        if None, returns original data with original sampling
        otherwise, resamples to correct Timedelta using forward filling

    dtype : dtype
        used to reduce memory consumption (np.float64 can be expensive)

    start : int or pd.Timedelta
        offset at which to start reading; an int is interpreted as a sample
        count at the file's native sampling rate

    filter : bool
        if True, apply a 1-50 Hz Butterworth bandpass to each channel

    max_length : pd.Timedelta
        if not None, read only about this much data after the start offset
    Returns
    -------
    pd.DataFrame
        index is time, columns is waveform channel label

    """
    global file_list, file_list_lock
    waiting_for_path = True
    while waiting_for_path:  #hack around pyedflib allowing only one open handle per file: spin until no other worker holds this path
        file_list_lock.acquire()
        if path not in file_list:
            file_list.add(path)
            waiting_for_path = False
        file_list_lock.release()

    with pyedflib.EdfReader(
            path, check_file_size=pyedflib.CHECK_FILE_SIZE) as reader:
        channel_names = [
            headerDict['label'] for headerDict in reader.getSignalHeaders()
        ]
        sample_rates = [
            headerDict['sample_rate']
            for headerDict in reader.getSignalHeaders()
        ]
        for headerDict in reader.getSignalHeaders():
            if headerDict["dimension"] != "uV" and headerDict[
                    "label"] in util_funcs.get_common_channel_names():
                raise Exception(f"Channel {headerDict['label']} is not measured in uV")
        start_time = pd.Timestamp(reader.getStartdatetime())
        all_channels = []
        for i, channel_name in enumerate(channel_names):
            if type(
                    start
            ) == pd.Timedelta:  #we ask for time t=1 s, then we take into account sample rate
                start_count_native_freq = start / pd.Timedelta(seconds=1 /
                                                               sample_rates[i])
            else:
                start_count_native_freq = start
            if max_length is None:  #read everything
                signal_data = reader.readSignal(i,
                                                start=start_count_native_freq)
            else:
                numStepsToRead = int(
                    np.ceil(
                        max_length / pd.Timedelta(seconds=1 / sample_rates[i]))
                ) + 5  #adding a fudge factor of 5 for any off by 1 errors
                if "messy_read_outputs" in read_config() and read_config(
                )["messy_read_outputs"]:
                    sys.stdout = open(os.devnull, "w")
                signal_data = reader.readSignal(i,
                                                start=start_count_native_freq,
                                                n=numStepsToRead)
                if "messy_read_outputs" in read_config() and read_config(
                )["messy_read_outputs"]:
                    sys.stdout = sys.__stdout__

            signal_data = pd.Series(signal_data,
                                    index=pd.date_range(
                                        start=start_time,
                                        freq=pd.Timedelta(seconds=1 /
                                                          sample_rates[i]),
                                        periods=len(signal_data)),
                                    name=channel_name)
            all_channels.append(signal_data)
    data = pd.concat(all_channels, axis=1)
    data.index = data.index - data.index[0]
    data = data.astype(dtype)
    if filter:
        segSize = data.index[1] - data.index[0]
        data = data.apply(lambda col: filters.butter_bandpass_filter(
            col,
            lowcut=1,
            highcut=50,
            fs=pd.Timedelta(seconds=1) / segSize,
            order=5),
            axis=0)  #DataFrame.apply is not in-place, so assign the filtered result back
    if resample is not None:
        data = data.resample(resample).mean()

    file_list_lock.acquire()
    file_list.remove(path)
    file_list_lock.release()

    return data
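The filters module used above is not included here; a typical scipy-based implementation matching the call signature (data, lowcut, highcut, fs, order) could look like the sketch below (an assumption, not the project's actual code):

    from scipy.signal import butter, sosfiltfilt

    def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
        # design a Butterworth bandpass in second-order sections and apply it with zero phase shift
        sos = butter(order, [lowcut, highcut], btype="bandpass", fs=fs, output="sos")
        return sosfiltfilt(sos, data)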