Example 1
    def setup(self, X, y):
        '''
        Inspects data to build cross-validation folds, initialize results lists
        and dicts, and build other helper lists and dicts

        IN:
            SeriesModel
            X - pd dataframe - trial data.  see data structures.  Usually passed in to fit
            y - pd dataframe - trial labels.  see data structures.  Usually passed in to fit
        OUT: None
        '''
        start = time.time()
        ptf('\n>> i. Setting-up TriggeredSeriesModel ...', self.logfile)
        # Use parent class methods
        self.confusion_labels = self._build_confusion_labels(y)
        self.trial_lengths = self.find_trial_lengths(X)
        self.inspect_trial_shapes(X)
        self._build_results_dataframes(len(X))

        # use methods new to TriggeredSeriesModel
        self._build_crossvalidation_folds(y)

        end = time.time()
        ptf('\n>> Set-up completed (%s seconds) <<' % (end - start),
            self.logfile)
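A short usage sketch: setup() runs as step i. of fit() rather than being called directly; construction here mirrors main() (Example 36), and the names are assumptions drawn from the surrounding examples.

# Hedged sketch, assuming run_params comes from the run json as in main()
tsm = TriggeredSeriesModel(used_column_headers.values, **run_params)
tsm.fit(X, y, verbose=True)   # fit() invokes self.setup(X, y) internally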
Example 2
def load_spots_files(x, root_folder, column_headers,
        columns_to_drop, fname = 'spots.txt', verbose=False, LOGFILE=None):
    # csv file created in a Windows environment
    relpath = x['Folder'].replace(ntpath.sep, os.sep)
    data_file = os.path.join(root_folder, relpath, fname)
    if verbose:
        ptf(['Loading data from', data_file], LOGFILE)
    # load data file
    mini_df = pd.read_table(data_file, header=None)
    # some files have an extra tab, creating an extra data column; strip it
    if mini_df.values.shape[1] > 241:
        mini_df.drop(241, inplace=True, axis=1)

    # add column_headers to df
    mini_df.columns = column_headers

    # drop spots
    mini_df.drop(columns_to_drop, inplace=True, axis=1)


    # convert filenames to minutes
    # mini_df['time'] = mini_df['time'].apply(timestamp_interpretter)
    ntimes = len(mini_df)
    mini_df['time'] = np.arange(0, 20*ntimes, 20)

    x['data'] = mini_df
    return x
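For context, load_spots_files is applied row-wise over the trials DataFrame; this sketch mirrors how load_data invokes it (see Example 37).

df_raw = df_raw.apply(
    lambda x: load_spots_files(x, root_folder, column_headers,
                               columns_to_drop, LOGFILE=LOGFILE),
    axis=1)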
Example 3
    def _trigger_score_one_fold(self,
                                yt,
                                yp,
                                probas,
                                t,
                                testtrain='test',
                                fold='all'):
        number_of_times = t
        fpr, tpr, thresholds = roc_curve(yt, probas[:, 1], pos_label=1)
        roc_auc = auc(fpr, tpr)
        if self.verbose and fold == 'all':
            ptf('%s results' % testtrain, self.logfile)
            ptf(
                mcm.classification_report_ovr(
                    yt, yp, self.confusion_labels['detection']), self.logfile)

        scores = mcm.scores_binary(yt, yp)
        # builds confusion matrix of TP, FP, etc. for the detection case
        cm = mcm.confusion_matrix_binary(yt, yp)
        # detection - populate scores
        overall_acc = accuracy_score(yt, yp)
        score_dict = self._trigger_populate_score_dict(cm,
                                                       scores,
                                                       number_of_times,
                                                       fpr,
                                                       tpr,
                                                       thresholds,
                                                       roc_auc,
                                                       overall_acc=overall_acc)

        score_dict['fold'] = fold
        if testtrain == 'train':
            self._append_row_to_df(self.trigger_scores, score_dict)
        else:
            self._append_row_to_df(self.trigger_scores_test, score_dict)
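The scoring above rests on standard sklearn metrics; a self-contained sketch of the ROC/accuracy part on toy labels (the values are purely illustrative).

import numpy as np
from sklearn.metrics import roc_curve, auc, accuracy_score

yt = np.array([0, 0, 1, 1])              # true binary labels
probas = np.array([[0.9, 0.1],           # columns: P(class 0), P(class 1)
                   [0.6, 0.4],
                   [0.65, 0.35],
                   [0.2, 0.8]])
yp = (probas[:, 1] > 0.5).astype(int)    # predictions at a 0.5 threshold

fpr, tpr, thresholds = roc_curve(yt, probas[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)                  # 0.75 for this toy data
overall_acc = accuracy_score(yt, yp)     # also 0.75 here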
Example 4
def reload_data(LOGFILE = None, PICKLE_DATA = True,
    root_folder = 'Shared Sepsis Data', csv_filename = 'Sepsis_JCM.csv'):
    '''
    Reloads raw_data from folders.
    IN:
        LOGFILE -  fileobj - an open text file where logs are written
        PICKLE_DATA - bool - whether to pickle data once loaded
        root_folder - str - relative path to top level folder for all data and csv_file
        csv_filename - str - name of csv file containing the trial labels and locations
    '''
    csv_file = os.path.join(root_folder, csv_filename)

    X, y, used_column_headers, df, df_raw = load_data(root_folder, csv_file,
        verbose=False, LOGFILE=LOGFILE)

    # pickle data for later loading efficiency
    if PICKLE_DATA:
        start = time.time()
        ptf( '\n>> Pickling data ...\n', LOGFILE)
        for z, zname in izip([X,y,used_column_headers], PICKLE_NAMES):
            my_pickle(z, zname)
        end = time.time()
        ptf( 'Data pickled in %d seconds (%d total trials)' % ((end-start),
            len(X)), LOGFILE)

    return X, y, used_column_headers, df, df_raw
Example 5
def print_run_details(X, sm, LOGFILE=None):
    # prints other run details
    ptf('\n\n>> Run details <<', LOGFILE)
    ptf('\tntrials: %d' % len(X), LOGFILE)
    ptf('\tntimes: %d' % len(sm.times), LOGFILE)
    ptf('\n\n>> Other model details ', LOGFILE)
    ptf(sm, LOGFILE)
Example 7
def load_spots_files(x,
                     root_folder,
                     column_headers,
                     columns_to_drop,
                     fname='spots.txt',
                     verbose=False,
                     LOGFILE=None):
    # csv file created in a Windows environment
    relpath = x['Folder'].replace(ntpath.sep, os.sep)
    data_file = os.path.join(root_folder, relpath, fname)
    if verbose:
        ptf(['Loading data from', data_file], LOGFILE)
    # load data file
    mini_df = pd.read_table(data_file, header=None)
    # some files have an extra tab, creating an extra data column; strip it
    if mini_df.values.shape[1] > 241:
        mini_df.drop(241, inplace=True, axis=1)

    # add column_headers to df
    mini_df.columns = column_headers

    # drop spots
    mini_df.drop(columns_to_drop, inplace=True, axis=1)

    # convert filenames to minutes
    # mini_df['time'] = mini_df['time'].apply(timestamp_interpretter)
    ntimes = len(mini_df)
    mini_df['time'] = np.arange(0, 20 * ntimes, 20)

    x['data'] = mini_df
    return x
    def trigger_predict(self, model_detection, X_test, fold, t):
        number_of_times = t
        if self.verbose:
            ptf( 'Predicting detection fold:%d, nt:%d ...' % (fold, number_of_times), self.logfile)

        y_predict_detection = model_detection.predict(X_test)
        y_probabilities_detection = model_detection.predict_proba(X_test)

        return y_predict_detection, y_probabilities_detection
Example 9
    def trigger_predict(self, model_detection, X_test, fold, t):
        number_of_times = t
        if self.verbose:
            ptf(
                'Predicting detection fold:%d, nt:%d ...' %
                (fold, number_of_times), self.logfile)

        y_predict_detection = model_detection.predict(X_test)
        y_probabilities_detection = model_detection.predict_proba(X_test)

        return y_predict_detection, y_probabilities_detection
Example 10
def save_model(sm, RUNID, MODELFILENAME, LOGFILE=None):
    '''
    Saves model to file

    IN:
        sm - SeriesModel - SeriesModel or TriggeredSeriesModel for this run
        RUNID - str - str name for the folder where file will be saved
        MODELFILENAME - str - filename SeriesModel will be saved to
        LOGFILE - file obj - open logfile for outputting print statements
    '''
    model_file = open('./' + RUNID + '/' + MODELFILENAME, 'wb')
    ptf('\n>> Writing model results to %s' % MODELFILENAME, LOGFILE)
    pickle.dump(sm, model_file, -1)
    model_file.close()
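save_model pickles the whole model with the highest protocol (-1); a matching load sketch, assuming the same RUNID/MODELFILENAME layout used above.

import pickle

with open('./' + RUNID + '/' + MODELFILENAME, 'rb') as model_file:
    sm = pickle.load(model_file)   # restores the SeriesModel saved above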
Example 11
    def trigger_train(self, X, y, fold, t):
        '''
        Trains models for a timestep, fold
        IN:
            TriggeredSeriesModel
            X - nparrays - final features for this train fold, timestep (ntrials X nfeatures)
            y - dict of nparrays - labels for this train fold (ntrials) for each label class (key)
            fold - int - fold index
                NOTE: only the detection label is used in this implementation, but all are passed
                in for backwards compatibility with SeriesModel
            t - int - time index
        OUT:
            models - model - trained model for this timestep, fold
            predictions - np array - train predictions nparray (ntrials in this fold)
            probabilities - np array - train probabilities nparray (ntrials in this fold X 2)
        '''
        number_of_times = t
        # (X_train_detection, X_train_gram, X_train_classification) = X
        np_X_detection = X
        print len(y)
        (y_train_detection, y_train_gram, y_train_classification) = y

        # fit detection
        if self.verbose:
            ptf(
                'Training detection fold:%d, nt:%d ...' %
                (fold, number_of_times), self.logfile)

        model_detection = self._fit_class(np_X_detection,
                                          y_train_detection,
                                          self.detection_base_model,
                                          self.detection_base_model_arguments,
                                          step=('detection t=%d_%d' %
                                                (fold, number_of_times)))

        # store model, predict
        y_predict_detection = model_detection.predict(np_X_detection)
        y_probabilities_detection = model_detection.predict_proba(
            np_X_detection)

        if not self.on_disk:
            self.trigger_models[fold][number_of_times] = model_detection

        else:
            self.pickle_time_step(model_detection,
                                  'trigger_model',
                                  t=number_of_times,
                                  fold=fold)

        return model_detection, y_predict_detection, y_probabilities_detection
Example 12
    def resample(self, X, y, t, fold):
        if not self.resample_method:
            return X, y
        else:
            start = time.time()
            if self.verbose:
                ptf('> Resampling for timestep %d, fold %d' % (t, fold),
                    self.logfile)

            # create resampler
            if self.resample_method == 'under':
                print 'UNDER SAMPLING is not implemented yet'
                return X, y
            elif self.resample_method == 'over':
                if self.oversample_method.lower() == 'smote':
                    resampler = SMOTE(**self.oversample_arguments)
                else:
                    print 'Your resampling method is not implemented yet'
                    return X, y
            else:
                # guard against an unknown method so resampler is never undefined
                print 'Your resampling method is not implemented yet'
                return X, y

            print type(X), type(y)
            print X.shape, y[0].shape
            Xsmote, ysmote = resampler.fit_transform(X, y[0])
            # resample
            ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)
            # ysmote_df = self.build_smoted_label_df(ysmote, y, fold)
            # # find new folds
            # folds, ynewdf = self.find_new_folds(Xsmote, ysmote, y)

            if self.debug:
                print np.sum(y[0] == 0), np.sum(ysmote == 0)
                print np.sum(y[0] == 1), np.sum(ysmote == 1)

            if self.on_disk:
                self.pickle_time_step(ysmote_tuple,
                                      'trigger_resample_labels',
                                      fold=fold,
                                      t=t)
                self.pickle_time_step(Xsmote,
                                      'trigger_resample_features',
                                      fold=fold,
                                      t=t)
            else:
                self.trigger_resample_labels[fold][t] = ysmote_tuple
                self.trigger_resample_features[fold][t] = Xsmote

            end = time.time()
            if self.verbose:
                ptf('... %d s' % (end - start), self.logfile)
            return Xsmote, ysmote_tuple
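A quick sanity check of what resampling did to the class balance, mirroring the debug prints above (pure numpy; the toy labels are illustrative).

import numpy as np

y_before = np.array([0, 0, 0, 0, 0, 0, 1, 1])             # imbalanced fold
y_after = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])  # e.g. after SMOTE
print(np.bincount(y_before))   # [6 2]
print(np.bincount(y_after))    # [6 6] -- minority class oversampled to parity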
Example 14
    def _regress(self, X):
        start = time.time()

        number_of_spots = X.iloc[0].shape[1]-1
        self.number_of_spots = number_of_spots
        coef_ = X.copy()
        scores_ = X.apply(lambda x: np.zeros(number_of_spots))
        for trial_index, x in enumerate(X):
            if trial_index % 100 == 0:
                if self.verbose:
                    ptf( 'Featurizing trial %d'%  trial_index, self.logfile)
            # regress coefficients are (poly order +1 )x(n_spots)
            coefficients = np.zeros((3, number_of_spots))
            scores = np.zeros(number_of_spots)

            t = x[:,0]
            t = t[self.reference_time:]
            for column_index in np.arange(x.shape[1]):
                # if column_index % 10 == 0:
                #     if self.verbose:
                #         ptf( 'ci:%d'%  column_index, self.logfile)
                # print column_index
                spot_index = column_index - 1
                if column_index == 0:
                    pass
                else:
                    # only fit data past the reference_time
                    # other data is 0 from preprocessing
                    # print t.shape, x[self.reference_time:, column_index].shape
                    # print t
                    # print x[self.reference_time:, column_index]
                    # print self.p_init
                    popt, pcov = curve_fit(self.sigmoid,
                        t,
                        x[self.reference_time:,column_index],
                        p0=self.p_init,
                        ftol=self.ftol,
                        xtol=self.xtol,
                        gtol=self.gtol,
                        maxfev=self.maxfev)
                    coefficients[:, spot_index] = popt
                    xpred = self.sigmoid(t, *popt)
                    scores[spot_index] = r2_score(x[self.reference_time:, column_index], xpred)
            coef_.iloc[trial_index] = coefficients
            scores_.iloc[trial_index] = scores

        end = time.time()
        ptf( 'Regressed %d trials in %d seconds' % (len(X), (end-start)), self.logfile)
        return coef_, scores_
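_regress fits self.sigmoid per spot with scipy's curve_fit; a standalone sketch on synthetic data. The three-parameter logistic below is an assumption consistent with the 3-row coefficient array above, not necessarily the exact form of self.sigmoid.

import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

def sigmoid(t, A, k, t0):
    # assumed form; matches the three coefficients stored per spot above
    return A / (1.0 + np.exp(-k * (t - t0)))

t = np.arange(0, 1200, 20.0)        # 20-minute time grid, as in the loaders
y_noisy = sigmoid(t, 500.0, 0.01, 600.0) + np.random.normal(0, 5.0, t.shape)

popt, pcov = curve_fit(sigmoid, t, y_noisy, p0=[400.0, 0.005, 500.0],
                       maxfev=10000)
r2 = r2_score(y_noisy, sigmoid(t, *popt))   # close to 1.0 on this data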
Example 15
def print_run_details(X, sm, LOGFILE=None):
    '''
    prints other run details

    IN:
        X - pd Series - raw feature data (used to report the number of trials)
        sm - SeriesModel - SeriesModel or TriggeredSeriesModel for this run
        LOGFILE - file obj - open logfile for outputting print statements
    OUT: None
    '''
    ptf('\n\n>> Run details <<', LOGFILE)
    ptf('\tntrials: %d' % len(X), LOGFILE)
    ptf('\tntimes: %d' % len(sm.times), LOGFILE)
    ptf('\n\n>> Other model details ', LOGFILE)
    ptf(sm, LOGFILE)
Example 17
    def featurize_triggers(self, X, t):
        '''
        Extracts features for detection, gram, classification from pruned
        data using conditions passed to init.
        IN:
            SeriesModel
            X - pd dataframe - preprocessed trial data
            t - int - time index
        OUT:
            X - np_array - extracted features (ntrials X nfeatures) at this timestep
        '''
        start = time.time()
        number_of_times = t
        # featurize, storing featurizers at each timestep
        if self.verbose:
            ptf('> 1. Featurizing nt=%d ...' % number_of_times, self.logfile)

        X_train = self._subset_data(X, number_of_times)
        if self.debug:
            print t, X.iloc[0].shape

        (X_trigger, trigger_times), trigger_featurizer = self._featurize_class(
            X_train, self.detection_base_featurizer,
            self.detection_base_featurizer_arguments)
        if self.debug:
            print t, X_trigger.iloc[0].shape, trigger_times.iloc[0].shape

        # convert to numpy arrays
        np_X_trigger = self._pandas_to_numpy(X_trigger)
        np_trigger_times = self._pandas_to_numpy(trigger_times)
        if self.debug:
            print 'Checking featurized shapes', np_X_trigger.shape, np_trigger_times.shape

        # store features
        if not self.on_disk:
            self.trigger_features[t] = np_X_trigger
            self.trigger_feature_times[t] = np_trigger_times
            self.trigger_featurizers[t] = trigger_featurizer
        else:
            self.pickle_time_step(np_X_trigger, 'trigger_features', t)
            self.pickle_time_step(np_trigger_times, 'trigger_feature_times', t)
            self.pickle_time_step(trigger_featurizer, 'trigger_featurizer', t)

        # Append results to results df later after scoring
        end = time.time()
        ptf('\n...(%s seconds) <' % (end - start), self.logfile)

        return np_X_trigger
    def featurize_triggers(self, X, t):
        '''
        Extracts features for detection, gram, classification from pruned
        data using conditions passed to init.
        IN:
            SeriesModel
            X - pd dataframe - preprocessed trial data
            t - int - time index
        OUT:
            X - np_array - extracted features (ntrials X nfeatures) at this timestep
        '''
        start = time.time()
        number_of_times = t
        # featurize, storing featurizers at each timestep
        if self.verbose:
            ptf( '> 1. Featurizing nt=%d ...' % number_of_times, self.logfile)

        X_train = self._subset_data(X, number_of_times)
        if self.debug:
            print t, X.iloc[0].shape

        (X_trigger, trigger_times), trigger_featurizer = self._featurize_class(X_train,
            self.detection_base_featurizer, self.detection_base_featurizer_arguments)
        if self.debug:
            print t, X_trigger.iloc[0].shape, trigger_times.iloc[0].shape

        # convert to numpy arrays
        np_X_trigger = self._pandas_to_numpy(X_trigger)
        np_trigger_times = self._pandas_to_numpy(trigger_times)
        if self.debug:
            print 'Checking featurized shapes', np_X_trigger.shape, np_trigger_times.shape

        # store features
        if not self.on_disk:
            self.trigger_features[t] = np_X_trigger
            self.trigger_feature_times[t] = np_trigger_times
            self.trigger_featurizers[t] = trigger_featurizer
        else:
            self.pickle_time_step(np_X_trigger, 'trigger_features', t)
            self.pickle_time_step(np_trigger_times, 'trigger_feature_times', t)
            self.pickle_time_step(trigger_featurizer, 'trigger_featurizer', t)

        # Append results to results df later after scoring
        end = time.time()
        ptf('\n...(%s seconds) <' % (end-start), self.logfile)

        return np_X_trigger
    def resample(self, X, y, t, fold):
        if not self.resample_method:
            return X, y
        else:
            start = time.time()
            if self.verbose:
                ptf('> Resampling for timestep %d, fold %d' % (t, fold), self.logfile)

            # create resampler
            if self.resample_method == 'under':
                print 'UNDER SAMPLING is not implemented yet'
                return X, y
            elif self.resample_method == 'over':
                if self.oversample_method.lower() == 'smote':
                    resampler = SMOTE(**self.oversample_arguments)
                else:
                    print 'Your resampling method is not implemented yet'
                    return X, y
            else:
                # guard against an unknown method so resampler is never undefined
                print 'Your resampling method is not implemented yet'
                return X, y

            print type(X), type(y)
            print X.shape, y[0].shape
            Xsmote, ysmote = resampler.fit_transform(X, y[0])
            # resample
            ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)
            # ysmote_df = self.build_smoted_label_df(ysmote, y, fold)
            # # find new folds
            # folds, ynewdf = self.find_new_folds(Xsmote, ysmote, y)

            if self.debug:
                print np.sum(y[0]==0), np.sum(ysmote == 0)
                print np.sum(y[0]==1), np.sum(ysmote == 1)

            if self.on_disk:
                self.pickle_time_step(ysmote_tuple, 'trigger_resample_labels', fold=fold, t=t)
                self.pickle_time_step(Xsmote, 'trigger_resample_features', fold=fold, t=t)
            else:
                self.trigger_resample_labels[fold][t] = ysmote_tuple
                self.trigger_resample_features[fold][t] = Xsmote

            end = time.time()
            if self.verbose:
                ptf('... %d s' % (end-start), self.logfile)
            return Xsmote, ysmote_tuple
    def trigger_train(self, X, y, fold, t):
        '''
        Trains models for a timestep, fold
        IN:
            TriggeredSeriesModel
            X - nparrays - final features for this train fold, timestep (ntrials X nfeatures)
            y - dict of nparrays - labels for this train fold (ntrials) for each label class (key)
            fold - int - fold index
                NOTE: only the detection label is used in this implementation, but all are passed
                in for backwards compatibility with SeriesModel
            t - int - time index
        OUT:
            models - model - trained model for this timestep, fold
            predictions - np array - train predictions nparray (ntrials in this fold)
            probabilities - np array - train probabilities nparray (ntrials in this fold X 2)
        '''
        number_of_times = t
        # (X_train_detection, X_train_gram, X_train_classification) = X
        np_X_detection = X
        print len(y)
        (y_train_detection, y_train_gram, y_train_classification) = y

        # fit detection
        if self.verbose:
            ptf( 'Training detection fold:%d, nt:%d ...' % (fold, number_of_times), self.logfile)

        model_detection = self._fit_class(np_X_detection,
            y_train_detection,
            self.detection_base_model,
            self.detection_base_model_arguments,
            step=('detection t=%d_%d' % (fold,number_of_times)))

        # store model, predict
        y_predict_detection = model_detection.predict(np_X_detection)
        y_probabilities_detection = model_detection.predict_proba(np_X_detection)

        if not self.on_disk:
            self.trigger_models[fold][number_of_times] = model_detection

        else:
            self.pickle_time_step(model_detection, 'trigger_model', t=number_of_times, fold=fold)

        return model_detection, y_predict_detection, y_probabilities_detection
Example 21
def reload_data(LOGFILE=None,
                PICKLE_DATA=True,
                root_folder='Shared Sepsis Data',
                csv_filename='Sepsis_JCM.csv'):
    '''
    Reloads raw_data from folders.
    IN:
        LOGFILE -  fileobj - an open text file where logs are written
        PICKLE_DATA - bool - whether to pickle data once loaded
        root_folder - str - relative path to top level folder for all data and
            csv_file
        csv_filename - str - name of csv file containing the trial labels and
            locations
    OUT:
        X - pd Series - Series of features.  Each row is a trial (index) whose
            data is a (number of times) X (number of features + 1) numpy array
        y - pd DataFrame - labels data frame.  Each row is a trial (index) and
            the labels of each class are the columns
        used_column_headers - list of str - names of the data columns kept
        df - pd DataFrame - DataFrame containing all trial data after elimination
            of extraneous spots, trials
        df_raw - pd DataFrame - DataFrame containing all trial data (before pruning)
    '''
    csv_file = os.path.join(root_folder, csv_filename)

    X, y, used_column_headers, df, df_raw = load_data(root_folder,
                                                      csv_file,
                                                      verbose=False,
                                                      LOGFILE=LOGFILE)

    # pickle data for later loading efficiency
    if PICKLE_DATA:
        start = time.time()
        ptf('\n>> Pickling data ...\n', LOGFILE)
        for z, zname in izip([X, y, used_column_headers], PICKLE_NAMES):
            my_pickle(z, zname)
        end = time.time()
        ptf(
            'Data pickled in %d seconds (%d total trials)' %
            ((end - start), len(X)), LOGFILE)

    return X, y, used_column_headers, df, df_raw
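A typical call, mirroring main() (Example 36): reload from raw data once on a new instance, pickling it so later runs can unpickle instead.

X, y, used_column_headers, df, df_raw = reload_data(LOGFILE=LOGFILE,
                                                    PICKLE_DATA=True)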
Example 22
    def _regress(self, X, model, poly):
        # timing of the regress step
        start = time.time()

        number_of_spots = X.iloc[0].shape[1]-1
        self.number_of_spots = number_of_spots
        coef_ = X.copy()
        scores_ = X.apply(lambda x: np.zeros(number_of_spots))
        for trial_index, x in enumerate(X):
            if trial_index % 100 == 0:
                if self.verbose:
                    ptf( 'Polynomial Featurizing trial %d'%  trial_index, self.logfile)
            # regress coefficients are (poly order +1 )x(n_spots)
            coefficients = np.zeros(((self.n+1), number_of_spots))
            scores = np.zeros(number_of_spots)
            # number of times different for each observation
        # QUESTION - what about trials of different lengths?
            #   -> maybe deal with this in the fit/predict steps?
            t = poly.fit_transform((x[:,0]).reshape(-1,1))

            # only regress on data past reference time
            t = t[self.reference_time:]

            for column_index in np.arange(x.shape[1]):
                spot_index = column_index - 1
                if column_index == 0:
                    pass
                else:
                    # only fit data past the reference_time
                    # other data is 0 from preprocessing
                    model.fit(t, x[self.reference_time:,column_index])
                    coefficients[:, spot_index] = model.coef_
                    scores[spot_index] = model.score(t, x[self.reference_time:, column_index])
                    # print trial_index, spot_index, model.coef_, scores[spot_index]
            coef_.iloc[trial_index] = coefficients
            scores_.iloc[trial_index] = scores

        end = time.time()
        # ptf( 'Regressed %d trials, n=%d in %d seconds' % (len(X), self.n, (end-start)), self.logfile)
        print 'PFR', coef_.iloc[0][:,0], X.iloc[0].shape
        return coef_, scores_
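The polynomial variant regresses each spot's intensity on powers of time; a minimal standalone sketch with sklearn. The model/poly objects here are illustrative stand-ins for the ones passed into _regress, assuming self.n == 2.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

t_raw = np.arange(0, 600, 20.0).reshape(-1, 1)   # one trial's time column
intensity = 3.0 + 0.2 * t_raw.ravel() - 1e-4 * t_raw.ravel() ** 2

poly = PolynomialFeatures(degree=2)              # degree plays the role of self.n
t = poly.fit_transform(t_raw)                    # columns: 1, t, t^2
model = LinearRegression(fit_intercept=False)    # the constant is the 1 column
model.fit(t, intensity)
coefficients = model.coef_                       # ~[3.0, 0.2, -1e-4]
r2 = model.score(t, intensity)                   # ~1.0 on this noiseless data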
def make_df(sm, y, runid, logfile=None):
    trial_hash = make_trial_hash(y)

    ch = make_ch(sm)
    # set-up data frame
    sm.logfile = logfile
    tf_collection = pd.DataFrame(columns=ch)
    tft_collection = pd.DataFrame(columns=ch)

    # append to dataframes
    index_list = []
    for t in sm.times:
        if t==12:
            continue
        tf = sm.load_time_step('trigger_features', t=t)
        tft = sm.load_time_step('trigger_feature_times', t=t)
        # check for nans and report their locations
        if np.sum(np.isnan(tf)):
            ptf('NANS in timestep %d for features' % t, logfile)
            ptf(np.argwhere(np.isnan(tf)), logfile)

        if np.sum(np.isnan(tft)):
            ptf('NANS in timestep %d for feature_times' % t, logfile)
            ptf(np.argwhere(np.isnan(tft)), logfile)

        # print t, tf.shape, tft.shape
        df = pd.DataFrame(tf, columns=ch[:-2])
        df['timestep'] = t
        df['trial'] = df.index
        if len(index_list):
            df['trial'] = index_list
        else:
            df['trial'] = df['trial'].apply(lambda x: trial_hash[x])
            index_list = df['trial'].values
        tf_collection = tf_collection.append(df)


        df = pd.DataFrame(tft, columns=ch[:-2])
        df['timestep'] = t
        df['trial'] = index_list
        tft_collection = tft_collection.append(df)


    # drop nans
    tf_collection = tf_collection.dropna()
    tft_collection = tft_collection.dropna()
    return tf_collection, tft_collection
    def _trigger_score_one_fold(self, yt, yp, probas, t, testtrain='test', fold='all'):
        number_of_times = t
        fpr, tpr, thresholds = roc_curve(yt, probas[:,1], pos_label=1)
        roc_auc = auc(fpr, tpr)
        if self.verbose and fold=='all':
            ptf('%s results' % testtrain, self.logfile)
            ptf(mcm.classification_report_ovr(yt, yp, self.confusion_labels['detection']), self.logfile)

        scores = mcm.scores_binary(yt, yp)
        # builds confusion matrix of TP, FP, etc. for the detection case
        cm = mcm.confusion_matrix_binary(yt, yp)
        # detection - populate scores
        overall_acc = accuracy_score(yt, yp)
        score_dict = self._trigger_populate_score_dict(cm, scores, number_of_times,
            fpr, tpr, thresholds, roc_auc, overall_acc=overall_acc)

        score_dict['fold'] = fold
        if testtrain == 'train':
            self._append_row_to_df(self.trigger_scores, score_dict)
        else:
            self._append_row_to_df(self.trigger_scores_test, score_dict)
Example 25
def reload_data(LOGFILE = None, PICKLE_DATA = True,
    root_folder = 'Shared Sepsis Data', csv_filename = 'Sepsis_JCM.csv'):
    '''
    Reloads raw_data from folders.
    IN:
        LOGFILE -  fileobj - an open text file where logs are written
        PICKLE_DATA - bool - whether to pickle data once loaded
        root_folder - str - relative path to top level folder for all data and
            csv_file
        csv_filename - str - name of csv file containing the trial labels and
            locations
    OUT:
        X - pd Series - Series of features.  Each row is a trial (index) whose
            data is a (number of times) X (number of features + 1) numpy array
        y - pd DataFrame - labels data frame.  Each row is a trial (index) and
            the labels of each class are the columns
        used_column_headers - list of str - names of the data columns kept
        df - pd DataFrame - DataFrame containing all trial data after elimination
            of extraneous spots, trials
        df_raw - pd DataFrame - DataFrame containing all trial data (before pruning)
    '''
    csv_file = os.path.join(root_folder, csv_filename)

    X, y, used_column_headers, df, df_raw = load_data(root_folder, csv_file,
        verbose=False, LOGFILE=LOGFILE)

    # pickle data for later loading efficiency
    if PICKLE_DATA:
        start = time.time()
        ptf( '\n>> Pickling data ...\n', LOGFILE)
        for z, zname in izip([X,y,used_column_headers], PICKLE_NAMES):
            my_pickle(z, zname)
        end = time.time()
        ptf( 'Data pickled in %d seconds (%d total trials)' % ((end-start),
            len(X)), LOGFILE)

    return X, y, used_column_headers, df, df_raw
Example 26
def make_df(sm, y, runid, logfile=None):
    trial_hash = make_trial_hash(y)

    ch = make_ch(sm)
    # set-up data frame
    sm.logfile = logfile
    tf_collection = pd.DataFrame(columns=ch)
    tft_collection = pd.DataFrame(columns=ch)

    # append to dataframes
    index_list = []
    for t in sm.times:
        if t == 12:
            continue
        tf = sm.load_time_step('trigger_features', t=t)
        tft = sm.load_time_step('trigger_feature_times', t=t)
        # check for nans and report their locations
        if np.sum(np.isnan(tf)):
            ptf('NANS in timestep %d for features' % t, logfile)
            ptf(np.argwhere(np.isnan(tf)), logfile)

        if np.sum(np.isnan(tft)):
            ptf('NANS in timestep %d for feature_times' % t, logfile)
            ptf(np.argwhere(np.isnan(tft)), logfile)

        # print t, tf.shape, tft.shape
        df = pd.DataFrame(tf, columns=ch[:-2])
        df['timestep'] = t
        df['trial'] = df.index
        if len(index_list):
            df['trial'] = index_list
        else:
            df['trial'] = df['trial'].apply(lambda x: trial_hash[x])
            index_list = df['trial'].values
        tf_collection = tf_collection.append(df)

        df = pd.DataFrame(tft, columns=ch[:-2])
        df['timestep'] = t
        df['trial'] = index_list
        tft_collection = tft_collection.append(df)

    # drop nans
    tf_collection = tf_collection.dropna()
    tft_collection = tft_collection.dropna()
    return tf_collection, tft_collection
Example 27
def main(RUNID='run001', START_DT_STR=None, MODELFILENAME='sm', PICKLE_DATA=False,
    DO_TESTS=False, PROFILE=False, verbose=False, debug=False,
    RELOAD=False, n_cpus=1,
    PICKLE_NAMES=['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']):
    '''
    Runs our series model or triggered series model job based on the runtime
    conditions and run parameters.

    IN:
        RUNID - str - str name for the folder where output will be stored and the
            name of the json (without extension) containing run parameters
            for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.
            Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading into
            a data frame
        DO_TESTS - bool - if unittests should be run (True), or a job run (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a smaller
            set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to true only for first run on a
            new instance, then set to False for future runs to save load time.
        n_cpus - int - number of cpus to use for multiprocessing jobs.
        PICKLE_NAMES - list of str - list of the X (features) dataframe, y (labels)
            data and spots_used file names.  When RELOAD is set to True, this is
            the filenames where this data will be saved.  When RELOAD is set to
            False, this is where the data will be loaded from.
    OUT:
        None
    '''

    RUNID = command_line_process(RUNID)
    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # get the run conditions for the runid from the json
    # NOTE excludes verbose and debug flags - those are fit parameters
    # and excludes runid since that is set up above
    with open((RUNID + '.json')) as f:
        run_params = json.load(f, object_hook=ascii_encode_dict)

    # to see if more ram is used for more cpus
    n_jobs = run_params['detection_model_arguments']['n_jobs']

    ### Unittests ###
    if DO_TESTS:
        start = time.time()
        ptf( '\n>> Unpickling data ...\n', LOGFILE)
        X = my_unpickle(PICKLE_NAMES[0])
        y = my_unpickle(PICKLE_NAMES[1])
        used_column_headers = my_unpickle(PICKLE_NAMES[2])

        end = time.time()
        ptf( 'Data unpickled in %d seconds (%d total trials)' % ((end-start),
            len(X)), LOGFILE)

        tsm_unit = run_tsm_unittests(X, y, used_column_headers.values,
            verbose=verbose, logfile=LOGFILE)
        # sm_unit = run_unittests(X_test, y_test, verbose=False)
    else:
        # output run conditions to screen and logfile
        bigstart = time.time()

        # start memory profiling
        if PROFILE:
            tr, tr_sm = start_memory_profiling()

        if RUNTYPE == 'trigger':
            ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID, LOGFILE)
        elif RUNTYPE == 'series':
            ptf('*** %s - SERIES MODEL - ***' % RUNID, LOGFILE)

        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE=LOGFILE,
            debug=debug, profile=PROFILE, verbose=verbose, start=True)

        if RELOAD:
            X, y, used_column_headers, df, df_raw = reload_data(LOGFILE, PICKLE_DATA)
        else:
            start = time.time()
            ptf( '\n>> Unpickling data ...\n', LOGFILE)
            X = my_unpickle(PICKLE_NAMES[0])
            y = my_unpickle(PICKLE_NAMES[1])
            used_column_headers = my_unpickle(PICKLE_NAMES[2])

            end = time.time()
            ptf( 'Data unpickled in %d seconds (%d total trials)' % ((end-start), len(X)), LOGFILE)

        run_params['logfile'] = LOGFILE
        run_params['runid'] = RUNID

        # create model
        if RUNTYPE == 'trigger':
            sm = TriggeredSeriesModel(used_column_headers.values, **run_params)
        elif RUNTYPE == 'series':
            sm = SeriesModel(**run_params)

        # Altogether now
        print ('** DOING THE FIT **')
        sm.fit(X, y, verbose=verbose, debug=debug)

        bigend = time.time()

        ptf('====> %d seconds (%0.1f mins)' % ((bigend-bigstart), (bigend-bigstart)/60.0), LOGFILE)
        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE=LOGFILE,
            debug=debug, profile=PROFILE, verbose=verbose, start=False)

        print_run_details(X, sm, LOGFILE)

        save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

        ## VIEW RESULTS
        if RUNTYPE == 'trigger':
            make_trigger_plots(sm, y, RUNID, debug=debug)
        elif RUNTYPE == 'series':
            make_series_plots(sm)

        if PROFILE:
            print_memory_profiles(sm, tr, tr_sm, LOGFILE=LOGFILE)

    LOGFILE.close()
Example 28
def save_model(sm, RUNID, MODELFILENAME, LOGFILE=None):
    # saves model to file
    model_file = open('./' + RUNID + '/' + MODELFILENAME, 'wb')
    ptf('\n>> Writing model results to %s' % MODELFILENAME, LOGFILE)
    pickle.dump(sm, model_file, -1)
    model_file.close()
def make_kde_plot(df, spot, runid, title=None, cmap='Greens', plotclass=None, logfile=None, debug=False):
    plt.figure()
    ptf('Plot KDE %s - %s' % (title, spot), logfile)
    x,y = stack_rows(df, spot)
    ptf('%s, %s' % (x.shape, y.shape), logfile)
    ptf('Check for nans', logfile)
    ptf('%s, %s' % (np.sum(np.isnan(x)), np.sum(np.isnan(y))), logfile)
    ptf('computing kde...', logfile)
    sns.kdeplot(x,y, shade=True, cmap=cmap)

    plottitle = runid + '-' + spot + ' - KDE trigger vs t'
    if title:
        plottitle += ' - ' + title
    if plotclass:
        plottitle += ' - ' + plotclass

    plt.title(plottitle)
    plt.xlabel('t (hrs)')
    if title:
        plt.ylabel(title)
    else:
        plt.ylabel('trigger metric')
    filename = runid + '/' + runid + '-' + spot + ' - KDE trigger vs t'
    if title:
        filename += ' - ' + title
    if plotclass:
        filename += ' - ' + plotclass

    ptf('Saving plot %s' % filename, logfile)
    plt.savefig(filename, dpi=200)
    if debug:
        plt.show()
    else:
        plt.close()
Example 30
def print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE = None,
    debug = False, profile=False, verbose = True, start = True):
    # Outputs header, footer describing job conditions

    if start:
        ptf('====> Starting job ID: %s_%s <====' % (RUNID, START_DT_STR), LOGFILE)
    else:
        ptf('====> Completed job ID: %s_%s <====' % (RUNID, START_DT_STR), LOGFILE)
    ptf('\tn_jobs: %d\tn_cpus: %d' % (n_jobs, n_cpus), LOGFILE)
    ptf('\tdebug: %s' % debug, LOGFILE)
    ptf('\tprofile: %s' % profile, LOGFILE)
    ptf('\tverbose: %s' % verbose, LOGFILE)
    for k, v in run_params.iteritems():
        ptf('\t%s: %s' % (k,v), LOGFILE)
Example 31
def split_train_test(X, y, test_size=0.25, verbose=False, LOGFILE=None):
    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,
    #                                                 stratify=y['classification'].unique())
    sss = StratifiedShuffleSplit(y=y['classification'], n_iter=1, test_size=test_size, random_state=1)
    for train_index, test_index in sss:

        ptf(['Train: ', len(train_index)], LOGFILE)
        ptf(['Test: ', len(test_index)], LOGFILE)

    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    if verbose:
        ptf( '\nTEST summary:', LOGFILE)
        ptf( y_test.groupby('classification').count(), LOGFILE)
        ptf( '\nTRAIN summary:', LOGFILE)
        ptf( y_train.groupby('classification').count(), LOGFILE)

    return X_train, X_test, y_train, y_test
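The splitter above is the old sklearn.cross_validation API (labels in the constructor, iterate the object directly). A sketch of the equivalent split under the newer sklearn.model_selection API:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# n_iter becomes n_splits, and the labels move to .split()
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=1)
for train_index, test_index in sss.split(np.zeros(len(y)), y['classification']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]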
Example 32
def print_memory_profiles(sm, tr, tr_sm, LOGFILE = None):
    '''prints report on memory profiles'''
    ptf( '\nSERIESMODEL profiling', LOGFILE)
    ptf( 'Look at size of seriesmodel object', LOGFILE)
    ptf( asizeof.asizeof(sm), LOGFILE)
    ptf( asizeof.asized(sm, detail=1).format(), LOGFILE)

    ptf( 'Look at how the SeriesModel class is doing', LOGFILE)
    tr_sm.create_snapshot()
    # print_summary() and print_diff() report to stdout; they take no file arg
    tr_sm.stats.print_summary()

    ptf( 'PROFILING', LOGFILE)
    ptf( 'Look at memory leaks up to this point', LOGFILE)
    tr.print_diff()
Example 33
def print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE = None,
    debug = False, profile=False, verbose = True, start = True):
    '''
    Outputs header, footer describing job conditions

    IN:
        run_params - dict - Dictionary from the run parameters json.
            Contains the initialization conditions for the seriesmodel of this
            run.
        n_jobs - int - number of jobs to be used by parallelizable solvers in
            seriesmodel
        n_cpus - int - number of cpus available on this machine
        RUNID - str - str name for the folder where output will be stored and the
            name of the json (without extension) containing run parameters
            for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.  Set
            in the header global params of capstone
        debug - bool - whether a full dataset should be used (False), or a smaller
            set of time points (True).  Condition for main/seriesmodel.
        profile - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output. Condition for
            main/seriesmodel.
        start - bool - whether this is the header (True) or footer (False) of
            the run output
    OUT: None
    '''
    if start:
        ptf('====> Starting job ID: %s_%s <====' % (RUNID, START_DT_STR), LOGFILE)
    else:
        ptf('====> Completed job ID: %s_%s <====' % (RUNID, START_DT_STR), LOGFILE)
    ptf('\tn_jobs: %d\tn_cpus: %d' % (n_jobs, n_cpus), LOGFILE)
    ptf('\tdebug: %s' % debug, LOGFILE)
    ptf('\tprofile: %s' % profile, LOGFILE)
    ptf('\tverbose: %s' % verbose, LOGFILE)
    for k, v in run_params.iteritems():
        ptf('\t%s: %s' % (k,v), LOGFILE)
Example 34
def make_kde_plot(df,
                  spot,
                  runid,
                  title=None,
                  cmap='Greens',
                  plotclass=None,
                  logfile=None,
                  debug=False):
    plt.figure()
    ptf('Plot KDE %s - %s' % (title, spot), logfile)
    x, y = stack_rows(df, spot)
    ptf('%s, %s' % (x.shape, y.shape), logfile)
    ptf('Check for nans', logfile)
    ptf('%s, %s' % (np.sum(np.isnan(x)), np.sum(np.isnan(y))), logfile)
    ptf('computing kde...', logfile)
    sns.kdeplot(x, y, shade=True, cmap=cmap)

    plottitle = runid + '-' + spot + ' - KDE trigger vs t'
    if title:
        plottitle += ' - ' + title
    if plotclass:
        plottitle += ' - ' + plotclass

    plt.title(plottitle)
    plt.xlabel('t (hrs)')
    if title:
        plt.ylabel(title)
    else:
        plt.ylabel('trigger metric')
    filename = runid + '/' + runid + '-' + spot + ' - KDE trigger vs t'
    if title:
        filename += ' - ' + title
    if plotclass:
        filename += ' - ' + plotclass

    ptf('Saving plot %s' % filename, logfile)
    plt.savefig(filename, dpi=200)
    if debug:
        plt.show()
    else:
        plt.close()
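A self-contained sketch of the bivariate KDE call at the core of make_kde_plot. Two positional arrays is the older seaborn API used here; recent seaborn releases expect kdeplot(x=..., y=..., fill=True) instead. The data below is synthetic.

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(0)
x = np.random.normal(10.0, 2.0, 500)           # e.g. trigger time (hrs)
y = 0.5 * x + np.random.normal(0, 1.0, 500)    # e.g. trigger metric

plt.figure()
sns.kdeplot(x, y, shade=True, cmap='Greens')   # shaded 2-D density
plt.xlabel('t (hrs)')
plt.ylabel('trigger metric')
plt.savefig('kde_sketch.png', dpi=200)
plt.close()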
Example 35
    def fit(self, X, y, verbose=False, trigger_only=True, debug=False):
        self.trigger_only = trigger_only
        self.verbose = verbose
        self.debug = debug

        # start with the second time
        tmin = self.reference_time + self.min_time
        self.times = np.arange(tmin, self.max_time, 1)

        # i) SETUP #
        self.setup(X, y)

        # Check trial integrity
        self._check_trial_integrity()

        # 0) PREPROCESS All trials
        X_preprocessed = self.preprocess(X)
        X_pruned = self.prune_spots(X_preprocessed, self.trigger_spots,
                                    self.column_headers)

        # check load state and run only needed times
        if self.load_state == 'featurize':
            ptf(
                '\n>> 1. Computing triggers from timestep %d ...' %
                self.load_time, self.logfile)
            run_times = self.make_run_times(self.times, self.load_time)
        else:
            ptf('\n>> 1. Computing triggers from first timestep %d ...' % tmin,
                self.logfile)
            run_times = self.times

        if debug:
            self.times = [30, 40, 50]
            run_times = self.times

        for t in run_times:
            start = time.time()
            if self.verbose:
                ptf('\n\nTIMESTEP %d...' % t, self.logfile)
            # 1) trigger_featurize
            X_featurized = self.featurize_triggers(X_pruned, t)

            # results to accumulate for this timestep
            y_train_true_timestep = []
            y_train_predict_timestep = []
            y_test_true_timestep = []
            y_test_predict_timestep = []
            y_train_probabilities = []
            y_test_probabilities = []

            for i, (fold, fold_indexes) in enumerate(self.folds.iteritems()):
                (X_train,
                 X_test) = self._subset_fold_triggers(X_featurized, fold)
                (y_train, y_test) = self._subset_fold_y(y, fold)

                # resample
                X_resampled, y_resampled = self.resample(
                    X_train, y_train, t, fold)
                y_train_resampled = y_resampled

                # 1) scale and/or reduce the data
                if self.verbose:
                    ptf('Scaling fold %d' % fold, self.logfile)
                X_scaled, scaler = self._scale_class(
                    X_resampled, self.detection_base_scaler,
                    self.detection_base_scaler_arguments)
                # transform the test fold with the scaler fit on the train fold
                X_test_scaled = scaler.transform(X_test)

                # 1A) reduce
                if self.verbose:
                    ptf('Reducing fold %d' % fold, self.logfile)
                X_reduced, reducer = self._reduce_class(
                    X_scaled, self.detection_base_reducer,
                    self.detection_base_reducer_arguments)
                X_test_reduced = reducer.transform(X_test_scaled)

                # 2) trigger_train
                if self.verbose:
                    ptf('Training fold %d' % fold, self.logfile)
                model, train_predictions, train_probabilities = \
                    self.trigger_train(X_reduced, y_train_resampled, fold, t)

                # 3) trigger_predict
                if self.verbose:
                    ptf('Predicting fold %d' % fold, self.logfile)
                test_predictions, test_probabilities = \
                    self.trigger_predict(model, X_test_reduced, fold, t)

                # 3A) store fold
                if self.verbose:
                    ptf('Storing fold %d' % fold, self.logfile)
                self._trigger_store_one_fold(
                    (train_predictions, train_probabilities),
                    (test_predictions, test_probabilities), fold, t)

                # 3B) score one fold
                if self.verbose:
                    ptf('Scoring fold %d' % fold, self.logfile)
                # print t, y_resampled[0].shape,
                self._trigger_score_one_fold(y_train_resampled[0],
                                             train_predictions,
                                             train_probabilities,
                                             t,
                                             testtrain='train',
                                             fold=fold)
                self._trigger_score_one_fold(y_test[0],
                                             test_predictions,
                                             test_probabilities,
                                             t,
                                             testtrain='test',
                                             fold=fold)

                # stack probas
                if fold == 0:
                    y_test_probabilities = test_probabilities
                    y_train_probabilities = train_probabilities
                else:
                    y_test_probabilities = np.vstack(
                        (y_test_probabilities, test_probabilities))
                    y_train_probabilities = np.vstack(
                        (y_train_probabilities, train_probabilities))

                y_train_true_timestep.extend(y_train_resampled[0])
                y_train_predict_timestep.extend(train_predictions)
                y_test_true_timestep.extend(y_test[0])
                y_test_predict_timestep.extend(test_predictions)
            # 4) trigger_score
            if self.verbose:
                ptf('Scoring timestep %d' % t, self.logfile)
            self._trigger_score_one_fold(y_train_true_timestep,
                                         y_train_predict_timestep,
                                         y_train_probabilities, t, 'train')
            self._trigger_score_one_fold(y_test_true_timestep,
                                         y_test_predict_timestep,
                                         y_test_probabilities, t, 'test')

        # if self.trigger_only:
        #     return X_featurized

        # 4A) Write avg tau to file for each trial.  We will pass this into triggered series model

        ## UPDATES FOR V2 BEYOND THIS ##

        # 5) transform tau

        # for t in range(0, self.max_postdetection_time, 20):
        return X_featurized
Example 36
def main(RUNID='run001',
         START_DT_STR=None,
         MODELFILENAME='sm',
         PICKLE_DATA=False,
         DO_TESTS=False,
         PROFILE=False,
         verbose=False,
         debug=False,
         RELOAD=False,
         n_cpus=1,
         PICKLE_NAMES=['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']):
    '''
    Runs our series model or triggered series model job based on the runtime
    conditions and run parameters.

    IN:
        RUNID - str - str name for the folder where output will be stored and the
            name of the json (without extension) containing run parameters
            for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.
            Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading into
            a data frame
        DO_TESTS - bool - if unittests should be run (True), or a job run (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a smaller
            set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to true only for first run on a
            new instance, then set to False for future runs to save load time.
        n_cpus - int - number of cpus to use for multiprocessing jobs.
        PICKLE_NAMES - list of str - list of the X (features) dataframe, y (labels)
            data and spots_used file names.  When RELOAD is set to True, this is
            the filenames where this data will be saved.  When RELOAD is set to
            False, this is where the data will be loaded from.
    OUT:
        None
    '''

    RUNID = command_line_process(RUNID)
    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # get the run conditions for the runid from the json
    # NOTE excludes verbose and debug flags - those are fit parameters
    # and excludes runid since that is set up above
    with open((RUNID + '.json')) as f:
        run_params = json.load(f, object_hook=ascii_encode_dict)

    # to see if more ram is used for more cpus
    n_jobs = run_params['detection_model_arguments']['n_jobs']

    ### Unittests ###
    if DO_TESTS:
        start = time.time()
        ptf('\n>> Unpickling data ...\n', LOGFILE)
        X = my_unpickle(PICKLE_NAMES[0])
        y = my_unpickle(PICKLE_NAMES[1])
        used_column_headers = my_unpickle(PICKLE_NAMES[2])

        end = time.time()
        ptf(
            'Data unpickled in %d seconds (%d total trials)' %
            ((end - start), len(X)), LOGFILE)

        tsm_unit = run_tsm_unittests(X,
                                     y,
                                     used_column_headers.values,
                                     verbose=verbose,
                                     logfile=LOGFILE)
        # sm_unit = run_unittests(X_test, y_test, verbose=False)
    else:
        # output run conditions to screen and logfile
        bigstart = time.time()

        # start memory profiling
        if PROFILE:
            tr, tr_sm = start_memory_profiling()

        if RUNTYPE == 'trigger':
            ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID, LOGFILE)
        elif RUNTYPE == 'series':
            ptf('*** %s - SERIES MODEL - ***' % RUNID, LOGFILE)

        print_job_info(run_params,
                       n_jobs,
                       n_cpus,
                       RUNID,
                       START_DT_STR,
                       LOGFILE=LOGFILE,
                       debug=debug,
                       profile=PROFILE,
                       verbose=verbose,
                       start=True)

        if RELOAD:
            X, y, used_column_headers, df, df_raw = reload_data(
                LOGFILE, PICKLE_DATA)
        else:
            start = time.time()
            ptf('\n>> Unpickling data ...\n', LOGFILE)
            X = my_unpickle(PICKLE_NAMES[0])
            y = my_unpickle(PICKLE_NAMES[1])
            used_column_headers = my_unpickle(PICKLE_NAMES[2])

            end = time.time()
            ptf(
                'Data unpickled in %d seconds (%d total trials)' %
                ((end - start), len(X)), LOGFILE)

        run_params['logfile'] = LOGFILE
        run_params['runid'] = RUNID

        # create model
        if RUNTYPE == 'trigger':
            sm = TriggeredSeriesModel(used_column_headers.values, **run_params)
        elif RUNTYPE == 'series':
            sm = SeriesModel(**run_params)

        # Altogether now
        ptf('** DOING THE FIT **', LOGFILE)
        sm.fit(X, y, verbose=verbose, debug=debug)

        bigend = time.time()

        ptf(
            '====> %d seconds (%0.1f mins)' % ((bigend - bigstart),
                                               (bigend - bigstart) / 60.0),
            LOGFILE)
        print_job_info(run_params,
                       n_jobs,
                       n_cpus,
                       RUNID,
                       START_DT_STR,
                       LOGFILE=LOGFILE,
                       debug=debug,
                       profile=PROFILE,
                       verbose=verbose,
                       start=False)

        print_run_details(X, sm, LOGFILE)

        save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

        ## VIEW RESULTS
        if RUNTYPE == 'trigger':
            make_trigger_plots(sm, y, RUNID, debug=debug)
        elif RUNTYPE == 'series':
            make_series_plots(sm)

        if PROFILE:
            print_memory_profiles(sm, tr, tr_sm, LOGFILE=LOGFILE)

    LOGFILE.close()
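The my_pickle/my_unpickle helpers used throughout this listing are not shown; a minimal sketch, assuming plain cPickle serialization to the file names given in PICKLE_NAMES:

import cPickle as pickle

def my_pickle(obj, filename):
    # serialize an object (e.g. the X dataframe) to disk
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def my_unpickle(filename):
    # reload an object previously saved with my_pickle
    with open(filename, 'rb') as f:
        return pickle.load(f)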
Esempio n. 37
0
def load_data(root_folder, csv_filename, verbose=False, LOGFILE=None):
    ptf('\n>> Loading csv...\n', LOGFILE)
    df_raw = pd.read_csv(csv_filename)

    # only work with "good" trials
    df_raw = df_raw[df_raw['Ignore'] != True]

    column_headers = create_column_headers()
    columns_to_drop = populate_columns_to_drop()

    start = time.time()
    ptf('\n>> Loading data files...\n', LOGFILE)
    df_raw = df_raw.apply(lambda x: load_spots_files(
        x, root_folder, column_headers, columns_to_drop, LOGFILE=LOGFILE),
                          axis=1)
    end = time.time()
    ptf(
        'Data loaded in %d seconds (%d total trials)' %
        ((end - start), len(df_raw)), LOGFILE)

    # DATA INSPECTION - finding values outside of 0, 4096
    # Reference_time must be greater than 1 if we use DII
    # see code and snippet below
    # All trials from the same day and at the same time (1: 20 minutes)
    # ==> instrumentation error at that time
    if verbose:
        start = time.time()
        ptf('Finding anomalous trials...', LOGFILE)
        an_df = find_data_anomalies(df_raw)
        end = time.time()
        ptf(
            'Anomalous trials found in %d seconds (%d trials):' %
            ((end - start), len(an_df)), LOGFILE)
        ptf(an_df, LOGFILE)
        '''
        Finding anomalous trials...
        Anomalous trials found in 44 seconds (13 trials):
        372          ([1], 20120504\BCB\E. coli 25922 10 CFU\F1)
        373         ([1], 20120504\BCB\E. coli 25922 10 CFU\F16)
        374        ([1], 20120504\BCB\S. aureus 29213 10 CFU\F7)
        375       ([1], 20120504\BCB\S. aureus 29213 10 CFU\F12)
        376       ([1], 20120504\BCB\S. aureus 29213 10 CFU\F21)
        377     ([1], 20120504\BCB\S. maltophilia Clinical A\F4)
        378    ([1], 20120504\BCB\S. maltophilia Clinical A\F13)
        379    ([1], 20120504\BCB\S. maltophilia Clinical A\F18)
        380    ([1], 20120504\BCB\S. maltophilia Clinical A\F23)
        381     ([1], 20120504\BCB\S. maltophilia Clinical B\F5)
        382    ([1], 20120504\BCB\S. maltophilia Clinical B\F10)
        383    ([1], 20120504\BCB\S. maltophilia Clinical B\F15)
        384    ([1], 20120504\BCB\S. maltophilia Clinical B\F20)
        '''

    # re-order 'data' part of frame for convenience
    # currently exists as a data frame with name columns
    # time, 2R, 2G, 2B .... 79R, 79G, 79B
    # need to be able to manipulate data as numpy arrays
    # keep column headers around for later use
    df, used_column_headers = prepare_data_frame(df_raw)

    ptf('Creating labels...', LOGFILE)
    label_dictionaries = create_labels_dictionaries()
    df = create_labels(df, label_dictionaries)

    # drop unwanted labels
    df = df[df['Ignore_label'] != True]

    X = df['data']
    y = df[['classification', 'gram', 'detection']]

    if verbose:
        ptf('\nSummary counts after cleaning:', LOGFILE)
        ptf(y.groupby('gram').count(), LOGFILE)
        ptf(y.groupby('detection').count(), LOGFILE)
        ptf(y.groupby('classification').count(), LOGFILE)

    return X, y, used_column_headers, df, df_raw
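A typical call, with illustrative (hypothetical) paths for the root folder and the master csv of trials:

ROOT_FOLDER = '/data/trials'        # hypothetical root of the raw spot files
CSV_FILENAME = 'trial_master.csv'   # hypothetical master csv listing trials

X, y, used_column_headers, df, df_raw = load_data(
    ROOT_FOLDER, CSV_FILENAME, verbose=True, LOGFILE=None)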
Esempio n. 38
0
def print_job_info(run_params,
                   n_jobs,
                   n_cpus,
                   RUNID,
                   START_DT_STR,
                   LOGFILE=None,
                   debug=False,
                   profile=False,
                   verbose=True,
                   start=True):
    '''
    Outputs header, footer describing job conditions

    IN:
        run_params - dict - Dictionary from the run parameters json describing
            this run.  Contains the initialization conditions for the
            seriesmodel of this run.
        n_jobs - int - number of jobs to be used by parallelizable solvers in
            seriesmodel
        n_cpus - int - number of cpus available on this machine
        RUNID - str - str name for the folder where output will be stored and the
            name of the json (without extension) containing run parameters
            for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp, as a string, appended to the logfile
            name.  Set in the global params at the top of capstone.
        debug - bool - whether a full dataset should be used (False), or a smaller
            set of time points (True).  Condition for main/seriesmodel.
        profile - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output. Condition for
            main/seriesmodel.
        start - bool - whether this is the header (True) or footer (False) of
            the run output
    OUT: None
    '''
    if start:
        ptf('====> Starting job ID: %s_%s <====' % (RUNID, START_DT_STR),
            LOGFILE)
    else:
        ptf('====> Completed job ID: %s_%s <====' % (RUNID, START_DT_STR),
            LOGFILE)
    ptf('\tn_jobs: %d\tn_cpus: %d' % (n_jobs, n_cpus), LOGFILE)
    ptf('\tdebug: %s' % debug, LOGFILE)
    ptf('\tprofile: %s' % profile, LOGFILE)
    ptf('\tverbose: %s' % verbose, LOGFILE)
    for k, v in run_params.iteritems():
        ptf('\t%s: %s' % (k, v), LOGFILE)
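The run-parameters json is decoded with the ascii_encode_dict object hook (see the json.load call above), which is not shown in this listing. A minimal sketch, assuming the usual Python 2 idiom of coercing unicode keys and values to plain str so the dict can be splatted into constructors:

def ascii_encode_dict(data):
    # json.load object_hook: convert unicode keys/values to str
    # (sketch only -- the project's own helper may differ)
    def ascii_encode(x):
        return x.encode('ascii') if isinstance(x, unicode) else x
    return dict((ascii_encode(k), ascii_encode(v))
                for k, v in data.iteritems())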
Esempio n. 41
0
def split_train_test(X, y, test_size=0.25, verbose=False, LOGFILE=None):
    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,
    #                                                 stratify=y['classification'].unique())
    sss = StratifiedShuffleSplit(y=y['classification'],
                                 n_iter=1,
                                 test_size=test_size,
                                 random_state=1)
    for train_index, test_index in sss:

        ptf(['Train: ', len(train_index)], LOGFILE)
        ptf(['Test: ', len(test_index)], LOGFILE)

    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    if verbose:
        ptf('\nTEST summary:', LOGFILE)
        ptf(y_test.groupby('classification').count(), LOGFILE)
        ptf('\nTRAIN summary:', LOGFILE)
        ptf(y_train.groupby('classification').count(), LOGFILE)

    return X_train, X_test, y_train, y_test
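This uses the pre-0.18 sklearn.cross_validation API, where StratifiedShuffleSplit takes y at construction and is iterated directly. Under the current sklearn.model_selection API, the equivalent split would look roughly like this sketch (not part of the original code):

from sklearn.model_selection import StratifiedShuffleSplit

def split_train_test_modern(X, y, test_size=0.25):
    # stratify on the classification label, as above
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=1)
    train_index, test_index = next(sss.split(X, y['classification']))
    return (X.iloc[train_index], X.iloc[test_index],
            y.iloc[train_index], y.iloc[test_index])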
Esempio n. 43
0
def print_memory_profiles(sm, tr, tr_sm, LOGFILE=None):
    '''
    Prints report on memory profiles

    IN:
        sm - SeriesModel - SeriesModel object for this run
        tr - SummaryTracker - SummaryTracker object for the whole run
        tr_sm - ClassTrackers - ClassTracker object of SeriesModel
        LOGFILE - file obj - Open logfile for print output
    OUT: None
    '''
    ptf('\nSERIESMODEL profiling', LOGFILE)
    ptf('Look at size of seriesmodel object', LOGFILE)
    ptf(asizeof.asizeof(sm), LOGFILE)
    ptf(asizeof.asized(sm, detail=1).format(), LOGFILE)

    ptf('Look at how the SeriesModel class is doing', LOGFILE)
    tr_sm.create_snapshot()
    tr_sm.stats.print_summary()
    # `print_summary() >> LOGFILE` is not valid Python; Stats writes to its
    # stream attribute, so point that at the logfile for a second copy
    # (assumes pympler's Stats.stream and an `import sys` at module top)
    if LOGFILE:
        tr_sm.stats.stream = LOGFILE
        tr_sm.stats.print_summary()
        tr_sm.stats.stream = sys.stdout

    ptf('PROFILING', LOGFILE)
    ptf('Look at memory leaks up to this point', LOGFILE)
    # print_diff() writes to stdout and resets its baseline, so call it once;
    # temporarily swap stdout to capture the report in the logfile
    _stdout = sys.stdout
    if LOGFILE:
        sys.stdout = LOGFILE
    try:
        tr.print_diff()
    finally:
        sys.stdout = _stdout
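The start_memory_profiling helper called from the main block is not shown in this listing; a minimal sketch of what it might return, assuming pympler's SummaryTracker and ClassTracker:

from pympler.tracker import SummaryTracker
from pympler.classtracker import ClassTracker

def start_memory_profiling():
    # whole-process tracker, used later for leak diffs (tr.print_diff())
    tr = SummaryTracker()
    # per-class tracker for SeriesModel instances
    tr_sm = ClassTracker()
    tr_sm.track_class(SeriesModel)
    tr_sm.create_snapshot()
    return tr, tr_sm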
Esempio n. 44
0
    def fit(self, X, y, verbose=False, trigger_only=True, debug=False):
        self.trigger_only = trigger_only
        self.verbose = verbose
        self.debug = debug

        # start with the second time
        tmin = self.reference_time + self.min_time
        self.times = np.arange(tmin, self.max_time, 1)

        # i) SETUP #
        self.setup(X,y)

        # Check trial integrity
        self._check_trial_integrity()

        # 0) PREPROCESS All trials
        X_preprocessed = self.preprocess(X)
        X_pruned = self.prune_spots(X_preprocessed, self.trigger_spots, self.column_headers)

        # check load state and run only needed times
        if self.load_state == 'featurize':
            ptf('\n>> 1. Computing triggers from timestep %d ...' % self.load_time, self.logfile)
            run_times = self.make_run_times(self.times, self.load_time)
        else:
            ptf('\n>> 1. Computing triggers from first timestep %d ...' % tmin, self.logfile)
            run_times = self.times

        if debug:
            self.times = [30,40,50]
            run_times = self.times

        for t in run_times:
            start = time.time()
            if self.verbose:
                ptf('\n\nTIMESTEP %d...' %t, self.logfile)
            # 1) trigger_featurize
            X_featurized = self.featurize_triggers(X_pruned, t)

            # results to accumulate for this timestep
            y_train_true_timestep = []
            y_train_predict_timestep = []
            y_test_true_timestep = []
            y_test_predict_timestep = []
            y_train_probabilities = []
            y_test_probabilities = []

            for i, (fold, fold_indexes) in enumerate(self.folds.iteritems()):
                (X_train, X_test) = self._subset_fold_triggers(X_featurized, fold)
                (y_train, y_test) = self._subset_fold_y(y, fold)


                # resample
                X_resampled, y_resampled = self.resample(X_train,y_train,t,fold)
                y_train_resampled = y_resampled

                # 1) scale and/or reduce the data
                if self.verbose:
                    ptf('Scaling fold %d' % fold, self.logfile)
                X_scaled, scaler = self._scale_class(X_resampled,
                    self.detection_base_scaler,
                    self.detection_base_scaler_arguments)
                # apply the scaler fit on the training fold; refitting on the
                # test fold (fit_transform) would leak test statistics
                X_test_scaled = scaler.transform(X_test)

                # 1A) reduce
                if self.verbose:
                    ptf('Reducing fold %d' % fold, self.logfile)
                X_reduced, reducer = self._reduce_class(X_scaled,
                    self.detection_base_reducer,
                    self.detection_base_reducer_arguments)
                X_test_reduced = reducer.transform(X_test_scaled)

                # 2) trigger_train
                if self.verbose:
                    ptf('Training fold %d' % fold, self.logfile)
                model, train_predictions, train_probabilities = \
                    self.trigger_train(X_reduced, y_train_resampled, fold, t)

                # 3) trigger_predict
                if self.verbose:
                    ptf('Predicting fold %d' % fold, self.logfile)
                test_predictions, test_probabilities = \
                    self.trigger_predict(model, X_test_reduced, fold, t)

                # 3A) store fold
                if self.verbose:
                    ptf('Storing fold %d' % fold, self.logfile)
                self._trigger_store_one_fold(
                    (train_predictions, train_probabilities),
                    (test_predictions, test_probabilities),
                    fold, t
                )

                # 3B) score one fold
                if self.verbose:
                    ptf('Scoring fold %d' % fold, self.logfile)
                # print t, y_resampled[0].shape,
                self._trigger_score_one_fold(y_train_resampled[0],
                    train_predictions, train_probabilities, t, testtrain='train', fold=fold)
                self._trigger_score_one_fold(y_test[0],
                    test_predictions, test_probabilities, t, testtrain='test', fold=fold)

                # stack probas (use the enumerate index; fold keys need not be 0-based)
                if i == 0:
                    y_test_probabilities = test_probabilities
                    y_train_probabilities = train_probabilities
                else:
                    y_test_probabilities = np.vstack((y_test_probabilities, test_probabilities))
                    y_train_probabilities = np.vstack((y_train_probabilities, train_probabilities))

                y_train_true_timestep.extend(y_train_resampled[0])
                y_train_predict_timestep.extend(train_predictions)
                y_test_true_timestep.extend(y_test[0])
                y_test_predict_timestep.extend(test_predictions)
            # 4) trigger_score
            if self.verbose:
                ptf('Scoring timestep %d' % t, self.logfile)
            self._trigger_score_one_fold(y_train_true_timestep,
                y_train_predict_timestep, y_train_probabilities, t, 'train')
            self._trigger_score_one_fold(y_test_true_timestep,
                y_test_predict_timestep, y_test_probabilities, t, 'test')


        # if self.trigger_only:
        #     return X_featurized


        # 4A) Write avg tau to file for each trial.  We will pass this into triggered series model

        ## UPDATES FOR V2 BEYOND THIS ##

        # 5) transform tau

        # for t in range(0, self.max_postdetection_time, 20):
        return X_featurized
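The _scale_class/_reduce_class helpers used per fold above are not shown in this listing. A plausible minimal sketch, assuming each instantiates the configured class with its argument dict, fits on the (resampled) training fold only, and returns the fitted object so the test fold can reuse it:

    def _scale_class(self, X_train, scaler_class, scaler_arguments):
        # fit the configured scaler on the training fold only
        scaler = scaler_class(**scaler_arguments)
        X_scaled = scaler.fit_transform(X_train)
        return X_scaled, scaler

    def _reduce_class(self, X_train, reducer_class, reducer_arguments):
        # same pattern for the dimensionality reducer (e.g. PCA)
        reducer = reducer_class(**reducer_arguments)
        X_reduced = reducer.fit_transform(X_train)
        return X_reduced, reducer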
Esempio n. 45
0
    def _regress(self, X):
        start = time.time()
        number_of_spots = X.iloc[0].shape[1]-1
        self.number_of_spots = number_of_spots
        Xp_ = X.copy()
        scores_ = X.apply(lambda x: np.zeros(number_of_spots))

        for trial_index, x in enumerate(X):
            if trial_index % 100 == 0 and self.verbose:
                ptf('Taking derivatives of trial %d' % trial_index, self.logfile)
            number_of_times = len(x)
            if self.maxmin:
                Xp = np.zeros((1, number_of_spots))
                if self.stacked:
                    Xp = np.zeros((2, number_of_spots))
            else:
                Xp = np.zeros((number_of_times, number_of_spots))
            trigger_times = np.zeros(number_of_spots)

            if self.stacked:
                trigger_times = np.zeros((2, number_of_spots))


            # print x.shape, number_of_times, number_of_spots, Xp.shape, scores.shape
            # column 0 is the time column; spot data starts at column 1
            for column_index in np.arange(x.shape[1]):
                spot_index = column_index - 1
                if column_index == 0:
                    pass
                else:
                    score = 0
                    fp = x[:, column_index].reshape(-1,1)
                    # want the first AND second derivative
                    if self.stacked:
                        if self.order > 2:
                            print 'ERR - Not implemented for order >2'
                            # bare return would break tuple unpacking at the call site
                            return None, None
                        if not self.maxmin:
                            print 'ERR - only works with maxmin'
                            return None, None
                        fp, s = pade(fp, self.dx)
                        if self.gauss:
                            fp = gaussian_filter1d(fp, self.sigma)
                        fpp, s2 = pade(fp, self.dx)
                        if self.gauss:
                            fpp = gaussian_filter1d(fpp, self.sigma)
                        Xp[0, spot_index] = np.max(np.abs(fp))
                        Xp[1, spot_index] = np.max(np.abs(fpp))

                        trigger_times[0, spot_index] = x[np.argmax(np.abs(fp)), 0]
                        trigger_times[1, spot_index] = x[np.argmax(np.abs(fpp)), 0]

                    else:
                        # print 'about to derive', fp.shape
                        for dummy in range(self.order):
                            # print 'derive loop', fp.shape
                            if np.sum(np.isnan(fp)):
                                print 'Incoming error T:%s, S:%s, O:%s' % (trial_index, spot_index, dummy)
                            fp, s = pade(fp, self.dx)
                            if np.sum(np.isnan(fp)):
                                print 'PADE error T:%s, S:%s, O:%s' % (trial_index, spot_index, dummy)

                            if self.gauss:
                                fp = fp.T
                                fp = gaussian_filter1d(fp, self.sigma)
                                fp = fp.T
                                if np.sum(np.isnan(fp)):
                                    print 'Gauss error T:%s, S:%s, O:%s' % (trial_index, spot_index, dummy)
                            # print 'after derive', fp.shape, s.shape
                            # score += s
                        # print fp.shape, Xp[:, spot_index].shape
                        # return the trigger time as the other feature instead of a score
                        trigger_times[spot_index] = x[np.argmax(np.abs(fp)),0]
                        if self.maxmin:
                            Xp[0, spot_index] = np.max(np.abs(fp))
                        else:
                            Xp[:,spot_index] = fp.flatten()
                            Xp[:self.reference_time, spot_index] = 0
                    # scores[spot_index] = score
            Xp_.iloc[trial_index] = Xp
            scores_.iloc[trial_index] = trigger_times
        end = time.time()
        ptf('Regressed %d trials in %d seconds' % (len(X), (end - start)),
            self.logfile)
        return Xp_, scores_
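pade (with self.dx) is the project's numerical-derivative helper and does not appear in this listing. A hypothetical stand-in using a plain central difference, matching the fp, s = pade(fp, self.dx) call signature (the real helper presumably uses a Pade or higher-order scheme):

import numpy as np

def pade(f, dx):
    # hypothetical sketch: derivative of f sampled every dx minutes,
    # plus a placeholder score as the second return value
    f = np.asarray(f, dtype=float)
    fp = np.gradient(f.ravel(), dx).reshape(f.shape)
    score = np.max(np.abs(fp))
    return fp, score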