Example #1
    def _get_variable_data(self,
                           variable,
                           categorical,
                           label=None,
                           trend=None):
        ''' Extract (and cache) design matrix for variable/categorical combo.
        '''

        # assign default labels (for variables passed in via split_by or orthogonalize)
        if label is None:
            label = '_'.join(listify(variable))

        # hash labels (rather than variables)
        cache_key = hash((label, categorical))

        if cache_key not in self.cache:

            n_rows = len(self.dataset.activation)

            # Handle special cases
            if variable == 'intercept':
                dm = np.ones((n_rows, 1))
            elif variable in ['subject', 'run']:
                n_vols, n_runs = self.dataset.n_vols, self.dataset.n_runs
                n_grps = self.dataset.activation['subject'].nunique()
                if variable == 'run':
                    n_grps *= n_runs
                else:
                    n_vols *= n_runs
                dm = np.zeros((n_rows, n_grps))
                val = 1 if trend is None else standardize(
                    np.arange(n_vols)**trend)
                for i in range(n_grps):
                    dm[(n_vols * i):(n_vols * i + n_vols), i] = val
            else:
                run_dms = []
                events = self.events.copy()
                sr = 100  # Sampling rate, in Hz
                tr = self.dataset.TR
                scale = np.ceil(tr * sr)
                events['run_onset'] = (events['run_onset'] * sr).round()
                events['duration'] = (events['duration'] * sr).round()
                n_rows = int(np.ceil(self.dataset.n_vols * scale))

                if categorical:
                    variable_cols = events[variable]
                    if isinstance(variable, (list, tuple)):
                        variable_cols = variable_cols.stack()
                    n_cols = variable_cols.nunique()

                    # map unique values onto numerical indices, and return
                    # data as a DataFrame where each column is a (named) level
                    # of the variable
                    levels = variable_cols.unique()
                    mapping = OrderedDict(zip(levels, list(range(n_cols))))
                    if label is not None:
                        self.level_map[label] = mapping
                    events[variable] = events[variable].replace(mapping)

                else:
                    n_cols = 1

                for (sub_, run_), g in events.groupby(['subject', 'run']):
                    dm = np.zeros((n_rows, n_cols))
                    for i, row in g.iterrows():
                        start = int(row['run_onset'])
                        end = int(start + row['duration'])

                        if categorical:
                            for var in listify(variable):
                                dm[start:end,
                                   np.array(row[var], dtype=int)] = 1
                        else:
                            if isinstance(variable, (tuple, list)):
                                raise ValueError(
                                    "Adding a list of terms is only "
                                    "supported for categorical variables "
                                    "(e.g., random factors).")
                            dm[start:end, 0] = row[variable]

                    dm = dm.reshape(-1, scale.astype(int), n_cols).mean(axis=1)
                    run_dms.append(dm[:self.dataset.n_vols])

                dm = np.concatenate(run_dms)

            self.cache[cache_key] = dm

        dm = self.cache[cache_key]

        # NOTE: we return a copy in order to avoid in-place changes to the
        # cached design matrix (e.g., we don't want the HRF convolution to
        # overwrite what's in the cache).
        return dm.copy()
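
The snippets on this page rely on a standardize helper that they do not define. Below is a minimal sketch of the array-function usage seen in these examples (z-scoring along an optional axis, consistent with calls such as standardize(np.arange(n_vols)**trend) and standardize(train_ozturk, axis=1)); the signature is an assumption, not the package's actual implementation:

import numpy as np


def standardize(a, axis=0):
    """Assumed helper: z-score `a` along `axis` (zero mean, unit variance)."""
    a = np.asarray(a, dtype=float)
    mean = a.mean(axis=axis, keepdims=True)
    std = a.std(axis=axis, keepdims=True)
    std[std == 0] = 1.0  # guard against constant slices to avoid division by zero
    return (a - mean) / std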
Example #2
    def _get_variable_data(self, variable, categorical, label=None, trend=None):
        ''' Extract (and cache) design matrix for variable/categorical combo.
        '''

        # assign default labels (for variables passed in via split_by or orthogonalize)
        if label is None:
            label = '_'.join(listify(variable))

        # hash labels (rather than variables)
        cache_key = hash((label, categorical))

        if cache_key not in self.cache:

            n_rows = len(self.dataset.activation)

            # Handle special cases
            if variable == 'intercept':
                dm = np.ones((n_rows, 1))
            elif variable in ['subject', 'run']:
                n_vols, n_runs = self.dataset.n_vols, self.dataset.n_runs
                n_grps = self.dataset.activation['subject'].nunique()
                if variable == 'run':
                    n_grps *= n_runs
                else:
                    n_vols *= n_runs
                dm = np.zeros((n_rows, n_grps))
                val = 1 if trend is None else standardize(
                    np.arange(n_vols)**trend)
                for i in range(n_grps):
                    dm[(n_vols*i):(n_vols*i+n_vols), i] = val
            else:
                run_dms = []
                events = self.events.copy()
                sr = 100  # Sampling rate, in Hz
                events['run_onset'] = (events['run_onset'] * sr).round()
                events['duration'] = (events['duration'] * sr).round()
                tr = self.dataset.TR
                scale = np.ceil(tr * sr)
                n_rows = int(np.ceil(self.dataset.n_vols * scale))

                if categorical:
                    variable_cols = events[variable]
                    if isinstance(variable, (list, tuple)):
                        variable_cols = variable_cols.stack()
                    n_cols = variable_cols.nunique()

                    # map unique values onto numerical indices, and return
                    # data as a DataFrame where each column is a (named) level
                    # of the variable
                    levels = variable_cols.unique()
                    mapping = OrderedDict(zip(levels, list(range(n_cols))))
                    if label is not None:
                        self.level_map[label] = mapping
                    events[variable] = events[variable].replace(mapping)

                else:
                    n_cols = 1

                for (sub_, run_), g in events.groupby(['subject', 'run']):
                    dm = np.zeros((n_rows, n_cols))
                    for i, row in g.iterrows():
                        start = int(row['run_onset'])
                        end = int(start + row['duration'])

                        if categorical:
                            for var in listify(variable):
                                dm[start:end, np.array(row[var], dtype=int)] = 1
                        else:
                            if isinstance(variable, (tuple, list)):
                                raise ValueError("Adding a list of terms is only "
                                        "supported for categorical variables "
                                        "(e.g., random factors).")
                            dm[start:end, 0] = row[variable]

                    dm = dm.reshape(-1, int(scale), n_cols).mean(axis=1)
                    run_dms.append(dm[:self.dataset.n_vols])

                dm = np.concatenate(run_dms)

            self.cache[cache_key] = dm

        dm = self.cache[cache_key]

        # NOTE: we return a copy in order to avoid in-place changes to the
        # cached design matrix (e.g., we don't want the HRF convolution to
        # overwrite what's in the cache).
        return dm.copy()
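
The closing NOTE deserves emphasis: the cached array is mutable, so returning it directly would let later in-place operations (such as the HRF convolution) silently overwrite the cache. A tiny standalone illustration of the aliasing hazard, independent of the class above:

import numpy as np

cache = {'dm': np.zeros((4, 2))}

aliased = cache['dm']      # shares the cached buffer
aliased[:, 0] = 1          # in-place edit corrupts the cache
print(cache['dm'][:, 0])   # [1. 1. 1. 1.]

cache['dm'] = np.zeros((4, 2))
safe = cache['dm'].copy()  # independent buffer, as returned above
safe[:, 0] = 1
print(cache['dm'][:, 0])   # [0. 0. 0. 0.]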
Example #3
    def add_term(self,
                 variable,
                 label=None,
                 categorical=False,
                 random=False,
                 split_by=None,
                 yoke_random_mean=False,
                 estimate_random_mean=False,
                 dist='Normal',
                 scale=None,
                 trend=None,
                 orthogonalize=None,
                 convolution=None,
                 conv_kws=None,
                 sigma_kws=None,
                 withhold=False,
                 plot=False,
                 **kwargs):
        '''
        Args:
            variable (str): name of the variable in the Dataset that contains
                the predictor data for the term, or a list of variable names.
            label (str): short name/label of the term; will be used as the
                name passed to PyMC. If None, the variable name is used.
            categorical (bool): if False, treat the input data as continuous;
                if True, treats input as categorical, and assigns discrete
                levels to different columns in the predictor matrix
            random (bool): if False, model as fixed effect; if True, model as
                random effect
            split_by (str): optional name of another variable on which to split
                the target variable. A separate hyperparameter will be included
                for each level in the split_by variable. E.g., if variable = 
                'stimulus' and split_by = 'category', the model will include
                one parameter for each individual stimulus, plus C additional
                hyperparameters for the stimulus variances (one per category).
            yoke_random_mean (bool): if True, the mean of the random effect
                distribution for each level of split_by is yoked to the
                corresponding element of the 'b_' + split_by parameter
                (e.g., a term added with withhold=True), rather than being
                fixed at 0 or estimated freely. Only used when split_by is
                provided.
            estimate_random_mean (bool): If False (default), set mean of random
                effect distribution to 0. If True, estimate mean parameters for 
                each level of split_by (in which case the corresponding fixed
                effect parameter should be omitted, for identifiability reasons).
                If split_by=None, this is equivalent to estimating a fixed
                intercept term. Note that models parameterized in this way are
                often less numerically stable than the default parameterization.
            dist (str, Distribution): the PyMC3 distribution to use for the
                prior. Can be either a string (must be the name of a class in
                pymc3.distributions), or an uninitialized Distribution object.
            scale (str, bool): if 'before', scaling will be applied before
                convolving with the HRF. If 'after', scaling will be applied to
                the convolved regressor. True is treated like 'before'. If
                None (default), no scaling is done.
            trend (int): if variable is 'subject' or 'run', passing an int here
                will result in addition of an Nth-order polynomial trend
                instead of the expected intercept. E.g., when variable = 
                'run' and trend = 1, a linear trend will be added for each run.
            orthogonalize (list): list of variables to orthogonalize the target
                variable with respect to. For now, this only works for
                categorical covariates. E.g., if variable = 'condition' and
                orthogonalize = ['stimulus_category'], each level of condition
                will be residualized on all (binarized) levels of stimulus
                condition.
            convolution (str): the name of the convolution function to apply
                to the input data; must be a valid function in convolutions.py.
                If None, the default convolution function set at class
                initialization is used. If 'none' is passed, no convolution
                at all is applied.
            conv_kws (dict): optional dictionary of additional keyword
                arguments to pass onto the selected convolution function.
            sigma_kws (dict): optional dictionary of keyword arguments
                specifying the parameters of the Distribution to use as the
                sigma for a random variable. Defaults to HalfCauchy with
                beta=10. Ignored unless random=True.
            withhold (bool): if True, the PyMC distribution(s) will be created
                but not added to the prediction equation. This is useful when,
                e.g., yoking the mean of one distribution to the estimated
                value of another distribution, without including the same
                quantity twice.
            plot (bool): if True, plots the resulting design matrix component.
            kwargs: optional keyword arguments passed onto the selected PyMC3
                Distribution.
        '''

        if label is None:
            label = '_'.join(listify(variable))

        # Load design matrix for requested variable
        dm = self._get_variable_data(variable,
                                     categorical,
                                     label=label,
                                     trend=trend)
        n_cols = dm.shape[1]

        # Handle random effects with nesting/crossing. Basically this splits the design
        # matrix into a separate matrix for each level of split_by, stacked into 3D array
        if split_by is not None:
            split_dm = self._get_variable_data(split_by, True)
            dm = np.einsum('ab,ac->abc', dm, split_dm)

        # Orthogonalization
        # TODO: generalize this to handle any combination of settings; right
        # now it will only work properly when both the target variable and the
        # covariates are categorical fixed effects.
        if orthogonalize is not None:
            dm = self._orthogonalize(dm, orthogonalize)

        # Scaling and HRF: apply over last dimension
        # if there is no split_by, add a dummy 3rd dimension so code below works in general
        if dm.ndim == 2:
            dm = dm[..., None]

        if plot and plot != 'convolved':
            self.plot_design_matrix(dm, variable, split_by)

        for i in range(dm.shape[-1]):

            if scale and scale != 'after':
                dm[..., i] = standardize(dm[..., i])

            # Convolve with HRF
            if variable not in ['intercept'] and convolution != 'none':
                if convolution is None:
                    convolution = self.convolution
                elif not hasattr(convolution, 'shape'):
                    convolution = get_convolution(convolution, conv_kws)

                # Convolve each run separately
                n_vols = self.dataset.n_vols
                n_runs = int(len(dm) / n_vols)
                for r in range(n_runs):
                    start, end = r * n_vols, (r * n_vols) + n_vols
                    _convolved = self._convolve(dm[start:end, :, i],
                                                convolution)
                    dm[start:end, :, i] = _convolved  # np.squeeze(_convolved)

            if scale == 'after':
                dm[..., i] = standardize(dm[..., i])

        if plot and plot == 'convolved':
            self.plot_design_matrix(dm, variable, split_by)

        # remove the dummy 3rd dimension if it was added prior to scaling/convolution
        if dm.shape[-1] == 1:
            dm = dm.reshape(dm.shape[:2])

        with self.model:

            # Random effects
            if random:
                # User can pass sigma specification in sigma_kws.
                # If not provided, default to HalfCauchy with beta = 10.
                if sigma_kws is None:
                    sigma_kws = {'dist': 'HalfCauchy', 'beta': 10}

                if split_by is None:
                    sigma = self._build_dist('sigma_' + label, **sigma_kws)
                    if estimate_random_mean:
                        mu = self._build_dist('b_' + label, dist)
                    else:
                        mu = 0.
                    u = self._build_dist('u_' + label,
                                         dist,
                                         mu=mu,
                                         sd=sigma,
                                         shape=n_cols,
                                         **kwargs)
                    self.mu += pm.dot(dm, u)
                else:
                    # id_map is essentially a crosstab except each cell is either 0 or 1
                    id_map = self._get_membership_graph(variable, split_by)
                    for i in range(id_map.shape[1]):
                        # select just the factor levels that appear with the
                        # current level of split_by
                        group_items = id_map.iloc[:, i].astype(bool)
                        selected = dm[:, group_items.values, i]
                        # add the level effects to the model
                        name = '%s_%s' % (label, id_map.columns[i])
                        sigma = self._build_dist('sigma_' + name, **sigma_kws)
                        if yoke_random_mean:
                            mu = self.dists['b_' + split_by][i]
                        elif estimate_random_mean:
                            mu = self._build_dist('b_' + name, dist)
                        else:
                            mu = 0.
                        name, size = 'u_' + name, selected.shape[1]
                        u = self._build_dist(name,
                                             dist,
                                             mu=mu,
                                             sd=sigma,
                                             shape=size,
                                             **kwargs)
                        self.mu += pm.dot(selected, u)

                        # Update the level map
                        levels = group_items[group_items].index.tolist()
                        self.level_map[name] = OrderedDict(
                            zip(levels, list(range(size))))

            # Fixed effects
            else:
                b = self._build_dist('b_' + label,
                                     dist,
                                     shape=dm.shape[-1],
                                     **kwargs)
                if split_by is not None:
                    dm = np.squeeze(dm)
                if not withhold:
                    self.mu += pm.dot(dm, b)
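
For orientation, here is a hedged sketch of how add_term might be invoked, mirroring the arguments documented in the docstring above; the model object and the variable names ('condition', 'stimulus', 'category') are hypothetical placeholders rather than names taken from the package:

# Hypothetical calls, following the Args documented above.
model.add_term('intercept')                    # fixed intercept
model.add_term('run', random=True, trend=1)    # per-run linear trends
model.add_term('condition', categorical=True, scale='before')
model.add_term('stimulus', categorical=True, random=True,
               split_by='category',            # one variance hyperparameter per category
               sigma_kws={'dist': 'HalfCauchy', 'beta': 10})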
Example #4
    def add_term(self, variable, label=None, categorical=False, random=False,
                 split_by=None, yoke_random_mean=False, estimate_random_mean=False,
                 dist='Normal', scale=None, trend=None, orthogonalize=None,
                 convolution=None, conv_kws=None, sigma_kws=None, withhold=False,
                 plot=False, **kwargs):
        '''
        Args:
            variable (str): name of the variable in the Dataset that contains
                the predictor data for the term, or a list of variable names.
            label (str): short name/label of the term; will be used as the
                name passed to PyMC. If None, the variable name is used.
            categorical (bool): if False, treat the input data as continuous;
                if True, treats input as categorical, and assigns discrete
                levels to different columns in the predictor matrix
            random (bool): if False, model as fixed effect; if True, model as
                random effect
            split_by (str): optional name of another variable on which to split
                the target variable. A separate hyperparameter will be included
                for each level in the split_by variable. E.g., if variable = 
                'stimulus' and split_by = 'category', the model will include
                one parameter for each individual stimulus, plus C additional
                hyperparameters for the stimulus variances (one per category).
            yoke_random_mean (bool): if True, the mean of the random effect
                distribution for each level of split_by is yoked to the
                corresponding element of the 'b_' + split_by parameter
                (e.g., a term added with withhold=True), rather than being
                fixed at 0 or estimated freely. Only used when split_by is
                provided.
            estimate_random_mean (bool): If False (default), set mean of random
                effect distribution to 0. If True, estimate mean parameters for 
                each level of split_by (in which case the corresponding fixed
                effect parameter should be omitted, for identifiability reasons).
                If split_by=None, this is equivalent to estimating a fixed
                intercept term. Note that models parameterized in this way are
                often less numerically stable than the default parameterization.
            dist (str, Distribution): the PyMC3 distribution to use for the
                prior. Can be either a string (must be the name of a class in
                pymc3.distributions), or an uninitialized Distribution object.
            scale (str, bool): if 'before', scaling will be applied before
                convolving with the HRF. If 'after', scaling will be applied to
                the convolved regressor. True is treated like 'before'. If
                None (default), no scaling is done.
            trend (int): if variable is 'subject' or 'run', passing an int here
                will result in addition of an Nth-order polynomial trend
                instead of the expected intercept. E.g., when variable = 
                'run' and trend = 1, a linear trend will be added for each run.
            orthogonalize (list): list of variables to orthogonalize the target
                variable with respect to. For now, this only works for
                categorical covariates. E.g., if variable = 'condition' and
                orthogonalize = ['stimulus_category'], each level of condition
                will be residualized on all (binarized) levels of stimulus
                condition.
            convolution (str): the name of the convolution function to apply
                to the input data; must be a valid function in convolutions.py.
                If None, the default convolution function set at class
                initialization is used. If 'none' is passed, no convolution
                at all is applied.
            conv_kws (dict): optional dictionary of additional keyword
                arguments to pass onto the selected convolution function.
            sigma_kws (dict): optional dictionary of keyword arguments
                specifying the parameters of the Distribution to use as the
                sigma for a random variable. Defaults to HalfCauchy with
                beta=10. Ignored unless random=True.
            withhold (bool): if True, the PyMC distribution(s) will be created
                but not added to the prediction equation. This is useful when,
                e.g., yoking the mean of one distribution to the estimated
                value of another distribution, without including the same
                quantity twice.
            plot (bool): if True, plots the resulting design matrix component.
            kwargs: optional keyword arguments passed onto the selected PyMC3
                Distribution.
        '''

        if label is None:
            label = '_'.join(listify(variable))

        # Load design matrix for requested variable
        dm = self._get_variable_data(variable, categorical, label=label,
                                     trend=trend)
        n_cols = dm.shape[1]

        # Handle random effects with nesting/crossing. Basically this splits the design
        # matrix into a separate matrix for each level of split_by, stacked into 3D array
        if split_by is not None:
            split_dm = self._get_variable_data(split_by, True)
            dm = np.einsum('ab,ac->abc', dm, split_dm)

        # Orthogonalization
        # TODO: generalize this to handle any combination of settings; right
        # now it will only work properly when both the target variable and the
        # covariates are categorical fixed effects.
        if orthogonalize is not None:
            dm = self._orthogonalize(dm, orthogonalize)

        # Scaling and HRF: apply over last dimension
        # if there is no split_by, add a dummy 3rd dimension so code below works in general
        if dm.ndim == 2:
            dm = dm[..., None]

        for i in range(dm.shape[-1]):

            if scale and scale != 'after':
                dm[..., i] = standardize(dm[..., i])

            # Convolve with HRF
            if variable not in ['subject', 'run', 'intercept'] and convolution != 'none':
                if convolution is None:
                    convolution = self.convolution
                elif not hasattr(convolution, 'shape'):
                    convolution = get_convolution(convolution, conv_kws)

                _convolved = self._convolve(dm[..., i], convolution)
                dm[..., i] = _convolved  # np.squeeze(_convolved)

            if scale == 'after':
                dm[..., i] = standardize(dm[..., i])

        if plot:
            self.plot_design_matrix(dm, variable, split_by)

        # remove the dummy 3rd dimension if it was added prior to scaling/convolution
        if dm.shape[-1] == 1:
            dm = dm.reshape(dm.shape[:2])

        with self.model:

            # Random effects
            if random:

                # User can pass sigma specification in sigma_kws.
                # If not provided, default to HalfCauchy with beta = 10.
                if sigma_kws is None:
                    sigma_kws = {'dist': 'HalfCauchy', 'beta': 10}

                if split_by is None:
                    sigma = self._build_dist('sigma_' + label, **sigma_kws)
                    if estimate_random_mean:
                        mu = self._build_dist('b_' + label, dist)
                    else:
                        mu = 0.
                    u = self._build_dist('u_' + label, dist, mu=mu, sd=sigma,
                                         shape=n_cols, **kwargs)
                    self.mu += pm.dot(dm, u)
                else:
                    # id_map is essentially a crosstab except each cell is either 0 or 1
                    id_map = self._get_membership_graph(variable, split_by)
                    for i in range(id_map.shape[1]):
                        # select just the factor levels that appear with the
                        # current level of split_by
                        group_items = id_map.iloc[:, i].astype(bool)
                        selected = dm[:, group_items.values, i]
                        # add the level effects to the model
                        name = '%s_%s' % (label, id_map.columns[i])
                        sigma = self._build_dist('sigma_' + name, **sigma_kws)
                        if yoke_random_mean:
                            mu = self.dists['b_' + split_by][i]
                        elif estimate_random_mean:
                            mu = self._build_dist('b_' + name, dist)
                        else:
                            mu = 0.
                        name, size = 'u_' + name, selected.shape[1]
                        u = self._build_dist(name, dist, mu=mu, sd=sigma,
                                             shape=size, **kwargs)
                        self.mu += pm.dot(selected, u)

                        # Update the level map
                        levels = group_items[group_items].index.tolist()
                        self.level_map[name] = OrderedDict(zip(levels, list(range(size))))

            # Fixed effects
            else:
                b = self._build_dist('b_' + label, dist, shape=dm.shape[-1],
                                     **kwargs)
                if split_by is not None:
                    dm = np.squeeze(dm)
                if not withhold:
                    self.mu += pm.dot(dm, b)
    def predict(self, prediction: np.ndarray):
        train_ozturk = np.sort(self.__beta_reduction(prediction), axis=1)
        detrended = standardize(train_ozturk, axis=1)
        u, v = self.__ozturk_function(detrended)
        return u, v
def main(plot):
    hungary_csv = 'processed.filled.hungarian.csv'
    swiss_csv = 'processed.filled.switzerland.csv'

    df_hungary = pd.read_csv(hungary_csv)
    df_hungary.columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'num'
    ]

    df_swiss = pd.read_csv(swiss_csv)
    df_swiss.columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'slope', 'num'
    ]

    # Map Swiss labels to binary (disease present/absent) before extracting
    # the target vectors, so the binary metrics below are valid
    df_swiss['num'] = df_swiss.num.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})

    X_hungary = df_hungary.iloc[:, :-1].values
    Y_hungary = df_hungary.iloc[:, -1].values

    X_swiss = df_swiss.iloc[:, :-1].values
    Y_swiss = df_swiss.iloc[:, -1].values

    # Recode sex as string labels for the summary-statistics plots only
    # (the numeric feature matrices above are left untouched)
    df_swiss['sex'] = df_swiss.sex.map({0: 'female', 1: 'male'})
    df_hungary['sex'] = df_hungary.sex.map({0: 'female', 1: 'male'})

    ## Create plots for summary statistics of the dataset
    if plot:
        plot_summary_statistics(df_hungary)
        plot_summary_statistics(df_swiss)

    ## Split into training and testing sets

    x_train_hung, x_test_hung, y_train_hung, y_test_hung = train_test_split(
        X_hungary,
        Y_hungary,
        test_size=0.2,
        random_state=0,
        stratify=Y_hungary)
    x_train_swiss, x_test_swiss, y_train_swiss, y_test_swiss = train_test_split(
        X_swiss, Y_swiss, test_size=0.2, random_state=0, stratify=Y_swiss)

    ## Standardize data
    standardize_scaler = standardize()
    x_train_hung = standardize_scaler.fit_transform(x_train_hung)
    x_test_hung = standardize_scaler.transform(x_test_hung)

    x_train_swiss = standardize_scaler.fit_transform(x_train_swiss)
    x_test_swiss = standardize_scaler.transform(x_test_swiss)

    criterion = ['gini', 'entropy']
    c = [0.1, 0.5, 1, 5, 10]
    kernel = ('linear', 'poly', 'rbf', 'sigmoid')
    gamma = ('auto', 0.0001, 0.001, 0.01, 1, 'scale')
    degree = (1, 2, 3, 4, 5)
    coef0 = (0, 0.0001, 0.001, 0.01, 1)
    penalty = ['l1', 'l2']
    learning_rate = [0.05, 0.01, 0.3, 0.5]
    max_depth = [3, 5, 7, 10]

    print("Performing gridsearch for optimal SVM hyperparameters...")

    parameters = {
        'C': c,
        'kernel': kernel,
        'degree': degree,
        'coef0': coef0,
        'gamma': gamma
    }

    ## Train and Predict with SVM

    init_svm = SVC()
    hung_svm_clf = GridSearchCV(init_svm, parameters, cv=3, iid=True)
    hung_svm_clf = hung_svm_clf.fit(x_train_hung, y_train_hung)
    hung_svm_test_pred = hung_svm_clf.predict(x_test_hung)
    print(hung_svm_clf.best_estimator_)
    print("Testing accuracy of SVM over Hungary data: " +
          str(accuracy_score(y_test_hung, hung_svm_test_pred)))
    print("Testing precision of SVM over Hungary data: " +
          str(metrics.precision_score(y_test_hung, hung_svm_test_pred)))
    print("Testing recall of SVM over Hungary data: " +
          str(metrics.recall_score(y_test_hung, hung_svm_test_pred)))
    print("Testing F-measure of SVM over Hungary data: " +
          str(metrics.f1_score(y_test_hung, hung_svm_test_pred)))
    print()
    swiss_svm = SVC()
    swiss_svm_clf = GridSearchCV(swiss_svm, parameters, cv=3, iid=True)
    swiss_svm_clf = swiss_svm_clf.fit(x_train_swiss, y_train_swiss)
    swiss_svm_predictions = swiss_svm_clf.predict(x_test_swiss)
    print("Testing accuracy of SVM over Swiss data: " +
          str(accuracy_score(y_test_swiss, swiss_svm_predictions)))
    print("Testing precision of SVM over Swiss data: " +
          str(metrics.precision_score(y_test_swiss, swiss_svm_predictions)))
    print("Testing recall of SVM over Swiss data: " +
          str(metrics.recall_score(y_test_swiss, swiss_svm_predictions)))
    print("Testing F-measure of SVM over Swiss data: " +
          str(metrics.f1_score(y_test_swiss, swiss_svm_predictions)))
    print()
    # Train and predict with Logistic Regression
    print(
        "Performing gridsearch for optimal Logistic Regression hyperparameters..."
    )

    parameters = {'C': c, 'penalty': penalty}
    lr_hung = LogisticRegression(solver='liblinear')
    lr_swiss = LogisticRegression(solver='liblinear')
    lr_hung_clf = GridSearchCV(lr_hung, parameters, cv=3, iid=True)
    # Fit hungary model
    lr_hung_clf.fit(x_train_hung, y_train_hung)
    predictions_lr_hung = lr_hung_clf.predict(x_test_hung)
    print(lr_hung_clf.best_estimator_)

    print("Testing accuracy of Logistic Regression over Hungary data: " +
          str(accuracy_score(y_test_hung, predictions_lr_hung)))
    print("Testing precision of LR over Hungary data: " +
          str(metrics.precision_score(y_test_hung, predictions_lr_hung)))
    print("Testing recall of LR over Hungary data: " +
          str(metrics.recall_score(y_test_hung, predictions_lr_hung)))
    print("Testing F-measure of LR over Hungary data: " +
          str(metrics.f1_score(y_test_hung, predictions_lr_hung)))
    print()
    lr_swiss_clf = GridSearchCV(lr_swiss, parameters, cv=3, iid=True)
    #Fit swiss model
    lr_swiss_clf.fit(x_train_swiss, y_train_swiss)
    predictions_lr_swiss = lr_swiss_clf.predict(x_test_swiss)
    print("Testing accuracy of Logistic Regression over Swiss data: " +
          str(accuracy_score(y_test_swiss, predictions_lr_swiss)))
    print("Testing precision of Logistic Regression over Swiss data: " +
          str(metrics.precision_score(y_test_swiss, predictions_lr_swiss)))
    print("Testing recall  of Logistic Regression over Swiss data: " +
          str(metrics.recall_score(y_test_swiss, predictions_lr_swiss)))
    print("Testing F-measure  of Logistic Regression over Swiss data: " +
          str(metrics.f1_score(y_test_swiss, predictions_lr_swiss)))
    print()
    print("Performing gridsearch for optimal Decision Tree hyperparameters...")
    # Train and predict with Decision Tree
    parameters = {'criterion': criterion}
    hung_dt = DecisionTreeClassifier()
    hung_dt_clf = GridSearchCV(hung_dt, parameters, cv=3, iid=True)
    # Fit model
    hung_dt_clf.fit(x_train_hung, y_train_hung)
    prediction_hung_dt = hung_dt_clf.predict(x_test_hung)
    print(hung_dt_clf.best_estimator_)

    print("Testing accuracy of Decision Tree over Hungary data: " +
          str(accuracy_score(y_test_hung, prediction_hung_dt)))
    print("Testing precision of Decision Tree over Hungary data: " +
          str(metrics.precision_score(y_test_hung, prediction_hung_dt)))
    print("Testing recall of Decision Tree over Hungary data: " +
          str(metrics.recall_score(y_test_hung, prediction_hung_dt)))
    print("Testing F-measure of Decision Tree over Hungary data: " +
          str(metrics.f1_score(y_test_hung, prediction_hung_dt)))
    print()
    swiss_dt = DecisionTreeClassifier()
    swiss_dt_clf = GridSearchCV(swiss_dt, parameters, cv=3, iid=True)
    #Fit Model
    swiss_dt_clf.fit(x_train_swiss, y_train_swiss)
    prediction_swiss_dt = swiss_dt_clf.predict(x_test_swiss)
    print("Testing accuracy of Decision Tree over Swiss data: " +
          str(accuracy_score(y_test_swiss, prediction_swiss_dt)))
    print("Testing precision of Decision Tree over Swiss data: " +
          str(metrics.precision_score(y_test_swiss, prediction_swiss_dt)))
    print("Testing recall of Decision Tree over Swiss data: " +
          str(metrics.recall_score(y_test_swiss, prediction_swiss_dt)))
    print("Testing F-measure of Decision Tree over Swiss data: " +
          str(metrics.f1_score(y_test_swiss, prediction_swiss_dt)))
    print()
    print("Performing gridsearch for optimal XGBoost hyperparameters...")
    ## Train and predict with XGBoost
    parameters = {'learning_rate': learning_rate, 'max_depth': max_depth}

    hung_xg = XGBClassifier()

    hung_xg_clf = GridSearchCV(hung_xg, parameters, cv=3, iid=True)
    hung_xg_clf.fit(x_train_hung, y_train_hung)
    predictions_hung_xg = hung_xg_clf.predict(x_test_hung)
    print(hung_xg_clf.best_estimator_)

    print("Testing accuracy of XGBoost over Hungary Data: " +
          str(accuracy_score(y_test_hung, predictions_hung_xg)))
    print("Testing precision of XGBoost over Hungary Data: " +
          str(metrics.precision_score(y_test_hung, predictions_hung_xg)))
    print("Testing recall of XGBoost over Hungary Data: " +
          str(metrics.recall_score(y_test_hung, predictions_hung_xg)))
    print("Testing F-measure of XGBoost over Hungary Data: " +
          str(metrics.f1_score(y_test_hung, predictions_hung_xg)))
    print()
    swiss_xg = XGBClassifier()
    swiss_xg_clf = GridSearchCV(swiss_xg, parameters, cv=3, iid=True)
    swiss_xg_clf.fit(x_train_swiss, y_train_swiss)
    predictions_swiss_xg = swiss_xg_clf.predict(x_test_swiss)
    print("Testing accuracy of XGBoost over Swiss Data: " +
          str(accuracy_score(y_test_swiss, predictions_swiss_xg)))
    print("Testing precision of XGBoost over Swiss Data: " +
          str(metrics.precision_score(y_test_swiss, predictions_swiss_xg)))
    print("Testing recall of XGBoost over Swiss Data: " +
          str(metrics.recall_score(y_test_swiss, predictions_swiss_xg)))
    print("Testing F-measure of XGBoost over Swiss Data: " +
          str(metrics.f1_score(y_test_swiss, predictions_swiss_xg)))
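
The four classifier blocks in main() repeat the same accuracy/precision/recall/F-measure reporting. A small helper along the following lines (a sketch using the same sklearn.metrics calls as the script; the function name is ours, not part of the original) would remove most of that duplication:

def report_metrics(name, y_true, y_pred):
    """Print the four test-set metrics used throughout main()."""
    print("Testing accuracy of " + name + ": " +
          str(accuracy_score(y_true, y_pred)))
    print("Testing precision of " + name + ": " +
          str(metrics.precision_score(y_true, y_pred)))
    print("Testing recall of " + name + ": " +
          str(metrics.recall_score(y_true, y_pred)))
    print("Testing F-measure of " + name + ": " +
          str(metrics.f1_score(y_true, y_pred)))
    print()

# e.g., report_metrics("SVM over Hungary data", y_test_hung, hung_svm_test_pred)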
Example #7
labels = np.array(h5["labels"])[idx]
mask = (labels.sum(axis=1) == 1)
labels = labels[mask].astype(int)
n_lab = labels.shape[1]

labels = (labels * np.arange(n_lab)[None, :]).sum(axis=1)
l_set = np.unique(labels)

cmap = plt.cm.gist_ncar
bounds = np.linspace(0, len(l_set), len(l_set) + 1)
ticks = [gen_d[i] for i in range(n_lab)]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

# FOR LOOP HERE

print(" -> loading data")
feat = np.array(h5["res50_avg"])[idx][mask]
print(" -> standardizing data")
normalize(feat, copy=False)
feat = standardize(feat)
print(" -> PCA-ing data")
feat = PCA(50).fit_transform(feat)
print(" -> TSNE-ing")
feat = TSNE(2).fit_transform(feat)

scat = plt.scatter(feat[:, 0], feat[:, 1], c=labels, cmap=cmap, norm=norm)
cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
cb.ax.set_yticklabels(ticks)  # weird, labels land between colors...
plt.show()
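
Note that this snippet applies two different rescalings before PCA: sklearn's normalize (unit-norm rows, one per sample) followed by the standardize helper (assumed here to z-score columns, as sketched earlier). A minimal contrast of the two operations:

import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[1.0, 2.0],
              [3.0, 4.0]])

# Row-wise L2 normalization: every sample is scaled to unit length.
print(normalize(X))

# Column-wise z-scoring (the assumed behaviour of standardize):
print((X - X.mean(axis=0)) / X.std(axis=0))
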
    def __oa_hidden(self, distribution_sequence):
        train_ozturk = np.sort(self.__beta_reduction(distribution_sequence), axis=1)
        detrended = standardize(train_ozturk, axis=1)
        u, v = self.__ozturk_function(detrended)
        return u, v