Example #1
0
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False, predict_fun="predict"):
    """Collect out-of-fold predictions for every cross-validation split.

    Each (train, test) pair yielded by the checked ``cv`` object is handed,
    together with a fresh clone of ``estimator``, to ``_cross_val_predict``;
    the per-fold outputs are concatenated and then re-ordered using the
    concatenated test indices.

    Parameters
    ----------
    estimator : estimator object
        Cloned once per fold (and once more when ``refit`` is true).
    X, y : array-like
        Passed through ``check_arrays`` before use.
    cv : int or cross-validation object
        Forwarded to ``check_cv``.
    n_jobs : int
        Number of jobs given to ``Parallel``.
    refit : bool
        When true, a clone of ``estimator`` fitted on all of (X, y) is
        returned as well.
    predict_fun : str
        Forwarded unchanged to ``_cross_val_predict``.

    Returns
    -------
    pred, or the tuple ``(pred, fitted_estimator)`` when ``refit`` is true.
    """
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    run_folds = Parallel(n_jobs=n_jobs)
    fold_outputs = run_folds(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(fold_outputs)

    # Test sets are either index arrays already, or boolean masks that have
    # to be converted to positions first.
    if cv.indices:
        test_positions = [test for _, test in cv]
    else:
        test_positions = [np.where(test)[0] for _, test in cv]
    index = np.concatenate(test_positions)

    # The right-hand side must be a copy: assigning ``pred`` onto a fancy
    # index of itself does not work as expected.
    pred[index] = pred.copy()

    if not refit:
        return pred
    return pred, clone(estimator).fit(X, y)
Example #2
0
    #X_sg_n = Binarizer(copy=False).transform(v_sg.transform(X_sg_n_clean))

    X_pl_n_clean = preprocess.load_data('data/plural_n.txt', labels=False)
    X_pl_n = v_pl.transform(X_pl_n_clean)
    #X_pl_n = Binarizer(copy=False).transform(v_pl.transform(X_pl_n_clean))

    scores = []
    n_steps = 100
    print "size  \tratio\tsg_score\tpl_score\tscore   \tsg_std  \tpl_std  \tstd"
    for train_proportion in np.linspace(0.1, 1, 10):
        train_size = len(X_sg) * train_proportion
        steps = [
            shuffle(X_sg_p, y_sg, X_pl_p, y_pl, n_samples=train_size)
            for k in xrange(n_steps)
        ]
        step_scores = Parallel(n_jobs=-1, verbose=False)(
            delayed(nouns_score)(*step, X_sg_test=X_sg_n, X_pl_test=X_pl_n)
            for step in steps)
        step_scores = np.array(step_scores)

        score = np.r_[train_size, train_proportion,
                      step_scores.mean(axis=0),
                      step_scores.std(axis=0)]
        print "%d\t%.2f\t%.6f\t%.6f\t%.6f\t%.4e\t%.4e\t%.4e" % tuple(score)
        scores.append(score)
    print "Pickling scores..."

    scores = np.array(scores)
    plot(scores)
    np.save("train_size_i", scores)
Example #3
0
def _cpu_map(fun, param_grid, n_jobs, verbose=True):
    """Apply ``fun`` to every entry of ``param_grid`` through ``Parallel``.

    The pool is created with the "threading" backend and the given
    ``n_jobs`` / ``verbose`` settings; whatever the ``Parallel`` call
    returns is handed back unchanged.
    """
    executor = Parallel(
        n_jobs=n_jobs,
        verbose=verbose,
        backend="threading",  # any sklearn backend should work here
    )
    tasks = (delayed(fun)(params) for params in param_grid)
    return executor(tasks)
Example #4
0
def mean_img(imgs, target_affine=None, target_shape=None,
             verbose=0, n_jobs=1):
    """ Compute the mean of the images (along the time, i.e. 4th, dimension)

    Note that if list of 4D images are given, the mean of each 4D image is
    computed separately, and the resulting mean is computed after (each
    image contributes with the same weight to the final mean).

    Parameters
    ==========

    imgs: Niimg-like object or iterable of Niimg-like objects
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        Images to mean.

    target_affine: numpy.ndarray, optional
        If specified, the image is resampled corresponding to this new affine.
        target_affine can be a 3x3 or a 4x4 matrix

    target_shape: tuple or list, optional
        If specified, the image will be resized to match this new shape.
        len(target_shape) must be equal to 3.
        A target_affine has to be specified jointly with target_shape.

    verbose: int, optional
        Controls the amount of verbosity: higher numbers give
        more messages (0 means no messages).

    n_jobs: integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    Returns
    =======
    mean: nibabel.Nifti1Image
        mean image

    """
    # The ``collections.Iterable`` alias was removed in Python 3.10; the ABC
    # lives in ``collections.abc``. Fall back to the old location so that
    # Python 2 keeps working.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    # A single image (a filename string or any non-iterable object) is
    # wrapped so that the rest of the function can always iterate.
    if (isinstance(imgs, _basestring) or
            not isinstance(imgs, Iterable)):
        imgs = [imgs, ]

    imgs_iter = iter(imgs)
    first_img = check_niimg(next(imgs_iter))

    # Compute the first mean to retrieve the reference
    # target_affine and target_shape if_needed
    n_imgs = 1
    running_mean, first_affine = _compute_mean(first_img,
                target_affine=target_affine,
                target_shape=target_shape)

    if target_affine is None or target_shape is None:
        target_affine = first_affine
        target_shape = running_mean.shape[:3]

    # The remaining images are processed through Parallel, all against the
    # reference affine/shape fixed above.
    for this_mean in Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_compute_mean)(n, target_affine=target_affine,
                                   target_shape=target_shape)
            for n in imgs_iter):
        n_imgs += 1
        # _compute_mean returns (mean_img, affine)
        this_mean = this_mean[0]
        running_mean += this_mean

    running_mean = running_mean / float(n_imgs)
    return new_img_like(first_img, running_mean, target_affine)
Example #5
0
def data_summary(table_schema, table, fname, sample_size=1.0, sample_rows=100, output_root='', keep_images=False, n_jobs=1):
    """
    Summarize basic information of all columns in a data table
    based on the provided data schema

    The result is written to 'data_summary_<fname>.xlsx' under output_root.
    A directory named 'img_temp' is (re)created in the current working
    directory for intermediate images and removed at the end unless
    keep_images is set. The caller's ``table`` is not modified.

    Parameters
    ----------
    table_schema: pandas DataFrame
        schema of the table, should contain data types of each column
        (the 'column' and 'type' columns are read here)
    table: pandas DataFrame
        the data table
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the summary (useful for large tables)
        float: sample size in percentage
    sample_rows: integer
        number of rows to get data samples (capped at the table size)
    output_root: string
        the root directory for the output file
    keep_images: boolean
        whether to keep all generated images
    n_jobs: int
        the number of jobs to run in parallel

    Raises
    ------
    ValueError
        if sample_size is > 1 but not a whole number, or if output_root is
        given and is not an existing directory
    """
    n_rows = table.shape[0]

    # check sample_size
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError('sample_size: only accept integer when it is > 1.0')
        if sample_size > n_rows:
            print("sample_size: %d is larger than the data size: %d" % (sample_size, n_rows))
        # a whole-number float such as 500.0 passes the check above;
        # DataFrame.sample needs a real integer row count
        sample_size = int(sample_size)

    # check output_root
    if output_root != '' and not os.path.isdir(output_root):
        raise ValueError('output_root: root not exists')

    # get data samples before sample_size
    # (never ask for more rows than exist: sampling without replacement
    # would raise on small tables)
    data_sample = table.sample(min(sample_rows, n_rows)).reset_index(drop=True)

    # calculate the sample size
    if sample_size <= 1.0:
        sample_size = int(n_rows * sample_size)
    if sample_size < n_rows:
        table = table.sample(sample_size).reset_index(drop=True)

    exclude_features, check_features = _check_features(table_schema)

    # temp dir to store all the images generated
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # list of results
    all_results = []

    # key features
    key_features = check_features['key']
    if len(key_features) > 0:
        # get the check result (no more workers than columns)
        _n_jobs = min(n_jobs, len(key_features))
        key_results = Parallel(n_jobs=_n_jobs)(delayed(_check_string)(col, table[[col]]) for col in key_features)
        all_results += key_results

        ws = wb.create_sheet(title=u'key')
        # write the final result to work sheet
        _insert_string_results(key_results, ws, 25)

    # numeric features
    numeric_features = check_features['numeric']
    if len(numeric_features) > 0:
        # get the check result
        _n_jobs = min(n_jobs, len(numeric_features))
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_numeric)(col, table[[col]], img_dir) for col in numeric_features)
        all_results += numeric_results

        ws = wb.create_sheet(title=u'numeric')
        # write the final result to work sheet
        _insert_numeric_results(numeric_results, ws, 35, img_dir)

    # string features
    string_features = check_features['str']
    if len(string_features) > 0:
        _n_jobs = min(n_jobs, len(string_features))
        string_results = Parallel(n_jobs=_n_jobs)(delayed(_check_string)(col, table[[col]]) for col in string_features)
        all_results += string_results

        ws = wb.create_sheet(title=u'string')
        # write the final result to work sheet
        _insert_string_results(string_results, ws, 25)

    # date features
    date_features = check_features['date']
    if len(date_features) > 0:
        # get the current time
        snapshot_date_now = str(datetime.datetime.now().date())
        # The derived "<col>_numeric" columns go into a private copy of the
        # date columns, so they never leak into the DataFrame the caller
        # passed in (which is what ``table`` still is when no subsampling
        # took place).
        date_table = table[list(date_features)].copy()
        for col in date_features:
            date_table['%s_numeric' %(col)] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(date_table[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
        _n_jobs = min(n_jobs, len(date_features))
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_date)('%s_numeric' % col, date_table[['%s_numeric' % col, col]], img_dir) for col in date_features)
        all_results += date_results

        ws = wb.create_sheet(title=u'date')
        # write the final result to work sheet
        _insert_numeric_results(date_results, ws, 35, img_dir, date_flag=True)

    # write schema
    ws = wb['Sheet']
    ws.title = 'summary'
    # explicit copy: the 'check' column below must not be written into a
    # slice of the caller's table_schema
    out_schema = table_schema[['column', 'type']].copy()
    out_schema['check'] = 'Ok'

    # output error features
    error_indices = []
    if len(exclude_features) > 0:
        out_schema['check'] = out_schema.apply(lambda x : 'exclude' if x['column'] in exclude_features else x['check'], axis=1)
        error_indices += list(out_schema[out_schema['column'].isin(exclude_features)].index.values)

    # tidy up the output
    error_msg_dict = {}
    correct_info = []
    for result in all_results:
        if 'error_msg' in result:
            error_msg_dict[result['column']] = result['error_msg']
        else:
            result_df = result['result_df']
            # some checks return a list whose first element is the frame
            if isinstance(result_df, list):
                result_df = result_df[0]
            info = pd.Series(result_df['value'].values, index=result_df['feature']).to_dict()
            correct_info.append(info)

    correct_info_df = pd.DataFrame(correct_info)
    out_schema = out_schema.merge(correct_info_df, on='column', how='left')

    if len(error_msg_dict) > 0:
        out_schema['check'] = out_schema.apply(lambda x : error_msg_dict[x['column']] if x['column'] in error_msg_dict else x['check'], axis=1)
        error_indices += list(out_schema[out_schema['column'].isin(error_msg_dict.keys())].index.values)

    # Check for non present columns
    for c in ['value_min', 'value_mean', 'value_median', 'value_max']:
        if c not in out_schema.columns:
            out_schema[c] = np.nan

    order_columns = ['column', 'type', 'check', 'sample_value', 'nan_rate', 'num_uni', 'value_min', 'value_mean',
                     'value_median', 'value_max']
    if 'date_min' in out_schema.columns.values:
        order_columns += ['date_min', 'date_max']

    _ = _insert_df(out_schema[order_columns], ws, header=True)
    # flag the 'check' cell (column C; +2 skips the header row and the
    # 0-based index) of every excluded or failed column
    for idx in error_indices:
        ws['C%d' %(idx+2)].style = 'Bad'

    _adjust_ws(ws=ws, row_height=25)

    # write data samples
    ws = wb.create_sheet(title=u'sample')
    _ = _insert_df(data_sample, ws, header=True, head_color=True, bold_first_column=False)
    _adjust_ws(ws=ws, row_height=20)

    wb.save(filename=os.path.join(output_root, 'data_summary_%s.xlsx' %(fname)))

    # remove all temp images
    if not keep_images:
        shutil.rmtree(img_dir)
Example #6
0
    def fit(self, X=None, y=None):
        """Fill the error grids over (graph sparsity, sample size).

        For every value in ``self.alphas_`` one graph is drawn with
        ``new_graph``; for every entry of ``self.grid_`` a penalty is
        selected with a clone of ``self.model_selection_estimator`` and
        ``self.n_trials`` calls to ``ae_trial`` are averaged into
        ``error_fro_``, ``error_supp_``, ``error_fp_`` and ``error_fn_``.

        Parameters
        ----------
        X, y : not used as input
            ``X`` is overwritten by freshly sampled data inside the loop and
            ``y`` is never read.

        Returns
        -------
        self
        """
        n_alpha_grid_points = 4

        self.error_fro_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
        self.error_supp_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
        self.error_fp_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
        self.error_fn_ = np.zeros((n_alpha_grid_points, self.n_grid_points))

        self.grid_ = np.linspace(5, 200, self.n_grid_points)
        #self.grid_ = np.logspace(np.log10(2), np.log10(200), self.n_grid_points)
        if self.adj_type == 'erdos-renyi':
            self.alphas_ = np.logspace(-2.3, np.log10(.025), n_alpha_grid_points)
            #self.alphas_ = np.linspace(0.95, 0.99, n_alpha_grid_points)[::-1]
        else:
            # np.logspace takes base-10 exponents for both endpoints. The
            # start used to be np.log(.15) (natural log), which made the
            # grid begin near 0.0127 instead of 0.15.
            self.alphas_ = np.logspace(np.log10(.15), np.log10(.4), n_alpha_grid_points)
        self.ks_ = []

        for aidx, alpha in enumerate(self.alphas_):
            if self.verbose:
                print ('at alpha {} ({}/{})'.format(
                    alpha,
                    aidx,
                    n_alpha_grid_points,
                ))

            # draw a new fixed graph for alpha
            cov, prec, adj = new_graph(self.n_features, alpha, adj_type=self.adj_type, random_sign=False, seed=1)
            # count the edges once: strict upper triangle of the adjacency
            n_nonzero_prec = np.count_nonzero(np.triu(adj, 1).flat)
            self.ks_.append(n_nonzero_prec)
            # NOTE(review): this single RandomState is shared by every trial
            # dispatched through the threading backend below; with
            # n_jobs > 1 the order of draws presumably depends on thread
            # scheduling -- confirm if exact reproducibility matters.
            mcmc_prng = np.random.RandomState(2)

            if self.verbose:
                print ('   Graph has {} nonzero entries'.format(n_nonzero_prec))

            for sidx, sample_grid in enumerate(self.grid_):
                n_samples = int(sample_grid * self.n_features)

                # model selection (once)
                X = mvn(n_samples, self.n_features, cov, random_state=mcmc_prng)
                ms_estimator = clone(self.model_selection_estimator)
                ms_estimator.fit(X)
                lam = getattr(ms_estimator, self.penalty_)

                if self.verbose:
                    display_lam = lam
                    # an array-valued penalty is summarised by its norm
                    if isinstance(lam, np.ndarray):
                        display_lam = np.linalg.norm(lam)
                    print ('   ({}/{}), n_samples = {}, selected lambda = {}'.format(
                            sidx,
                            self.n_grid_points,
                            n_samples,
                            display_lam))

                # setup default trial estimator
                trial_estimator = QuicGraphLasso(lam=lam,
                                                 mode='default',
                                                 init_method='corrcoef')

                # estimate statistical power
                errors = Parallel(
                    n_jobs=self.n_jobs,
                    verbose=False,
                    backend='threading',
                )(
                    delayed(ae_trial)(
                        trial_estimator, n_samples, self.n_features, cov, adj, random_state=mcmc_prng
                    )
                    for _ in range(self.n_trials))

                # each trial yields five values; the last one is not used
                error_fro, error_supp, error_fp, error_fn, _ = zip(*errors)
                self.error_fro_[aidx, sidx] = np.mean(error_fro)
                self.error_supp_[aidx, sidx] = np.mean(error_supp)
                self.error_fp_[aidx, sidx] = np.mean(error_fp)
                self.error_fn_[aidx, sidx] = np.mean(error_fn)

            if self.verbose:
                print ('Results at this row:')
                print ('   fro = {}'.format(self.error_fro_[aidx, :]))
                print ('   supp = {}'.format(self.error_supp_[aidx, :]))
                print ('   fp = {}'.format(self.error_fp_[aidx, :]))
                print ('   fn = {}'.format(self.error_fn_[aidx, :]))

        self.is_fitted = True
        return self
Example #7
0
def run_glm(Y, X, noise_model='ar1', bins=100, n_jobs=1, verbose=0):
    """ GLM fit for an fMRI data matrix

    Parameters
    ----------
    Y : array of shape (n_time_points, n_voxels)
        The fMRI data.

    X : array of shape (n_time_points, n_regressors)
        The design matrix.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    bins : int, optional
        Maximum number of discrete bins for the AR(1) coef histogram.

    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : int, optional
        The verbosity level. Default is 0

    Returns
    -------
    labels : array of shape (n_voxels,),
        A map of values on voxels used to identify the corresponding model.

    results : dict,
        Keys correspond to the different labels values
        values are RegressionResults instances corresponding to the voxels.

    Raises
    ------
    ValueError
        If noise_model is not one of the accepted values, or if Y and X do
        not have the same number of rows.

    """
    acceptable_noise_models = ['ar1', 'ols']
    if noise_model not in acceptable_noise_models:
        raise ValueError(
            "Acceptable noise models are {0}. You provided 'noise_model={1}'".\
                format(acceptable_noise_models, noise_model))

    if Y.shape[0] != X.shape[0]:
        raise ValueError(
            'The number of rows of Y should match the number of rows of X.'
            ' You provided X with shape {0} and Y with shape {1}'.\
                format(X.shape, Y.shape))

    # Create the model
    ols_result = OLSModel(X).fit(Y)

    if noise_model == 'ar1':
        # compute and discretize the AR1 coefs
        ar1 = ((ols_result.resid[1:] * ols_result.resid[:-1]).sum(axis=0) /
               (ols_result.resid ** 2).sum(axis=0))
        del ols_result
        # Truncate to multiples of 1/bins. The builtin ``int`` is used as
        # dtype because the ``np.int`` alias was removed in NumPy 1.24.
        ar1 = (ar1 * bins).astype(int) * 1. / bins
        # Fit the AR model according to current AR(1) estimates
        results = {}
        labels = ar1
        # Parallelize by creating a job per ARModel
        vals = np.unique(ar1)
        ar_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_ar_model_fit)(X, val, Y[:, labels == val]) for val in vals)
        for val, result in zip(vals, ar_result):
            results[val] = result
        del vals
        del ar_result

    else:
        # plain OLS: every voxel shares the single model stored under 0.0
        labels = np.zeros(Y.shape[1])
        results = {0.0: ols_result}

    return labels, results
Example #8
0
    def fit(self, X, y):
        """Fit one clone of the base estimator per entry of X.

        X must be a dict; each key is used as a ``roi_id``. When
        ``self.vote_graded`` is set, leave-one-out predictions are first
        produced per ROI and their accuracy against ``y`` becomes
        ``self.vote_weighting``; otherwise every ROI gets the same weight.
        The final estimators, fitted on all samples, are stored in
        ``self.estimators_`` keyed by ``roi_id``.

        Returns
        -------
        self : object
            Returns self.
        """
        if not isinstance(X, dict):
            raise ValueError("X has to be a dict")

        if self.base_estimator._estimator_type == "classifier":
            self.classes_ = np.unique(y)

        self.set_random_state()

        searchlight = (
            self.base_estimator._estimator_type == "searchlight_ensemble")

        def samples_of(roi_id):
            # searchlight entries are pairs whose first item holds the samples
            return X[roi_id][0] if searchlight else X[roi_id]

        def pick(roi_id, indices):
            data = samples_of(roi_id)
            return [data[i] for i in indices]

        def run(tasks):
            pool = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                            backend="threading")
            return pool(tasks)

        # One unfitted clone per ROI, tagged with its key.
        estimators = dict()
        for roi_id, x in X.items():
            roi_estimator = clone(self.base_estimator)
            roi_estimator.roi_id = roi_id
            if searchlight:
                roi_estimator.set_params(process_mask_img=x[1])
            estimators[roi_id] = roi_estimator

        if not self.vote_graded:
            self.vote_weighting = np.ones(len(X.keys())) / len(X.keys())
        else:
            y_pred = {roi_id: np.full(len(y), np.nan) for roi_id in X}
            for train_index, test_index in LeaveOneOut(len(y)):
                y_train = [y[i] for i in train_index]

                fitted = run(
                    delayed(_parallel_build_estimator)(
                        e, pick(roi_id, train_index), y_train)
                    for roi_id, e in estimators.items()
                )
                estimators_fit = {e.roi_id: e for e in fitted}
                y_pred_ = run(
                    delayed(_vote)(e, pick(roi_id, test_index), False)
                    for roi_id, e in estimators_fit.items()
                )
                for position, roi_id in enumerate(X):
                    y_pred[roi_id][test_index] = y_pred_[position]

            # Weight of each ROI = fraction of held-out samples it got right.
            y_true = np.array(y)
            self.vote_weighting = [
                np.mean(votes == y_true) for votes in y_pred.values()]
            if not np.any(self.vote_weighting):
                # all weights are zero: fall back to tiny uniform weights
                self.vote_weighting = 1e-10 * np.ones(len(self.vote_weighting))

        # Final fit of every ROI estimator on the full sample set.
        fitted_all = run(
            delayed(_parallel_build_estimator)(e, samples_of(roi_id), y)
            for roi_id, e in estimators.items()
        )

        self.estimators_ = {e.roi_id: e for e in fitted_all}

        return self
Example #9
0
    #     paradigm=paradigm, frametimes=frametimes,
    #     drift_model=drift_model, hrf_model=hrf_model)
    # ProgressReport().finish_dir(subject_output_dir)

    return dict(subject_id=subject_id,
                mask=mask_path,
                effects_maps=effects_maps,
                z_maps=z_maps,
                contrasts=contrasts)


# first level GLM
# do_subject_glm is wrapped with mem.cache; the cache location is
# <output_dir>/cache_dir.
mem = Memory(os.path.join(output_dir, "cache_dir"))
# never ask for more workers than there are subjects
n_jobs = min(n_jobs, len(subject_ids))
# one (cached) do_subject_glm call per subject id
first_levels = Parallel(n_jobs=n_jobs)(
    delayed(mem.cache(do_subject_glm))(subject_id)
    for subject_id in subject_ids)

# run second-level GLM
# Inputs are the per-subject "mask" and "effects_maps" entries; the contrast
# definitions are taken from the first subject only.
group_zmaps = group_one_sample_t_test(
    [subject_data["mask"] for subject_data in first_levels],
    [subject_data["effects_maps"] for subject_data in first_levels],
    first_levels[0]["contrasts"],
    output_dir,
    threshold=2.)
# NOTE(review): the "_minus_" filter is applied to the dict *values* of
# group_zmaps, not its keys -- presumably the values are strings (e.g. file
# paths) that contain the contrast name; confirm against
# group_one_sample_t_test.
plot_prob_atlas([zmap for zmap in group_zmaps.values() if "_minus_" in zmap],
                threshold=1.2,
                view_type="filled_contours")
plt.savefig("group_zmaps.png")
show()
Example #10
0
        s['lat'] = results['geometry']['location']['lat']
        s['lng'] = results['geometry']['location']['lng']
        return 'ok', s
    elif jsondict['status'] == 'OVER_QUERY_LIMIT':
        return 'keyIncrement', _get_coordinate(url, keys, keyI + 1,
                                               attempt_time)
    elif jsondict['status'] == 'ZERO_RESULTS':
        return 'zero', None
    elif jsondict['status'] == 'UNKNOWN_ERROR':
        return _get_coordinate(url, keys, keyI, attempt_time + 1)
    else:
        return 'keyError,parameterError', None


if __name__ == '__main__':
    # Geocode every address of 'fmtedAddress.csv' that has no latitude yet
    # and write whatever was collected to 'coordinate.csv'.
    adrsTable = pd.read_csv('fmtedAddress.csv')
    # NOTE(review): the key list is empty here -- presumably API keys have
    # to be filled in before running; confirm against task_distribute.
    keys = []
    n_jobs = 10
    l = []

    try:
        # Unique, non-null addresses whose 'lat' is still missing. This is
        # the same for every job, so it is computed once instead of being
        # re-evaluated for each start_index.
        pending = adrsTable.Address[adrsTable.lat.isnull()].dropna().unique()
        table = Parallel(n_jobs=n_jobs)(
            delayed(task_distribute)(pending, keys, start_index, n_jobs)
            for start_index in range(n_jobs))
        for s_list in table:
            l.extend(s_list)
    finally:
        # save what has been gathered even if something above failed
        if l:
            pd.concat(l, axis=1).T.to_csv('coordinate.csv', index=False)
Example #11
0
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run the parameter search and keep every per-fold fitted estimator.

        Every candidate from ``self._get_param_iterator()`` is evaluated on
        every fold of the checked ``cv`` object through ``_fit_and_score``.

        Parameters
        ----------
        X, y, groups
            Passed through ``indexable`` and then to ``cv.split``.
        **fit_params
            Forwarded to ``_fit_and_score`` and to the final refit. When
            empty, the deprecated constructor argument ``self.fit_params``
            is used instead if it is set.

        Attributes set
        --------------
        cv_estimators : list of ('model_<i>', [estimator per fold]) tuples
        folds : list of the (train, test) splits that were actually used
        cv_results_, n_splits_, scorer_, multimetric_
        best_index_, best_params_, best_score_ : when ``refit`` is truthy
            or scoring is single-metric
        best_estimator_ : when ``refit`` is truthy

        Returns
        -------
        self

        Raises
        ------
        ValueError
            For multi-metric scoring when ``refit`` is neither False nor the
            name of one of the scorers.
        """
        if self.fit_params is not None:
            warnings.warn(
                '"fit_params" as a constructor argument was '
                'deprecated in version 0.19 and will be removed '
                'in version 0.21. Pass fit parameters to the '
                '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn(
                    'Ignoring fit_params passed as a constructor '
                    'argument in favor of keyword arguments to '
                    'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # Materialise the splits exactly once. Calling cv.split a second
        # time later could yield different folds for a shuffling splitter
        # without a fixed seed, and the stored folds would then not match
        # the ones the estimators were fitted on.
        folds = list(cv.split(X, y, groups))

        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(delayed(_fit_and_score)(
                clone(base_estimator),
                X,
                y,
                scorers,
                train,
                test,
                self.verbose,
                parameters,
                fit_params=fit_params,
                return_train_score=self.return_train_score,
                return_n_test_samples=True,
                return_times=True,
                return_parameters=False,
                error_score=self.error_score,
                return_estimator=True) for parameters, (
                    train,
                    test) in product(candidate_params, folds))

        # Results are ordered candidate-major: n_splits consecutive entries
        # per candidate. n_splits was computed with the data above; a bare
        # cv.get_n_splits() fails for splitters that need X or groups.
        self.cv_estimators = []
        for i in range(n_candidates):
            current_slice = out[(i * n_splits):((i + 1) * n_splits)]
            self.cv_estimators.append(
                ('model_%d' % (i + 1),
                 [info[-1]['estimator'] for info in current_slice]))
        # drop the trailing estimator entry before unpacking the scores
        out = [info[:-1] for info in out]
        self.folds = folds

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        # (builtin ``int``: the ``np.int`` alias was removed in NumPy 1.24)
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name,
                   test_scores[scorer_name],
                   splits=True,
                   rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                _store('train_%s' % scorer_name,
                       train_scores[scorer_name],
                       splits=True)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" %
                                       refit_metric][self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
Example #12
0
    def fit(self, imgs, y=None, confounds=None):
        """Compute the mask and the group-level components.

        Parameters
        ----------
        imgs: list of Niimg-like objects
            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
            Data on which the PCA must be calculated. If this is a list,
            the affine is considered the same for all.

        y: ignored
            Not used by this method.

        confounds: iterable or None, optional
            One entry per element of ``imgs``; each entry is forwarded to
            ``session_pca`` together with the matching image. When None,
            ``None`` is forwarded for every image.

        Returns
        -------
        self
            The instance, with ``masker_``, ``mask_img_``, ``components_``
            and ``variance_`` set.

        Raises
        ------
        ValueError
            If ``imgs`` is empty, or if ``n_components`` is larger than the
            number of rows returned by ``session_pca`` for a subject.
        TypeError
            Re-raised unchanged when cloning ``self.mask`` fails for a reason
            other than the joblib ``cachedir=None`` case handled below.
        """

        # Hack to support single-subject data:
        if isinstance(imgs, (_basestring, nibabel.Nifti1Image)):
            imgs = [imgs]
            # This is a very incomplete hack, as it won't work right for
            # single-subject list of 3D filenames
        if len(imgs) == 0:
            # Common error that arises from a null glob. Capture
            # it early and raise a helpful message
            raise ValueError('Need one or more Niimg-like objects as input, '
                             'an empty list was given.')
        if confounds is None:
            confounds = itertools.repeat(None, len(imgs))

        # First, learn the mask
        if not isinstance(self.mask, (NiftiMasker, MultiNiftiMasker)):
            # self.mask is not a masker object: build a masker around it.
            self.masker_ = MultiNiftiMasker(mask_img=self.mask,
                                            smoothing_fwhm=self.smoothing_fwhm,
                                            target_affine=self.target_affine,
                                            target_shape=self.target_shape,
                                            standardize=self.standardize,
                                            low_pass=self.low_pass,
                                            high_pass=self.high_pass,
                                            mask_strategy='epi',
                                            t_r=self.t_r,
                                            memory=self.memory,
                                            memory_level=self.memory_level,
                                            n_jobs=self.n_jobs,
                                            verbose=max(0, self.verbose - 1))
        else:
            try:
                self.masker_ = clone(self.mask)
            except TypeError:
                # Workaround for a joblib bug: in joblib 0.6, a Memory object
                # with cachedir = None cannot be cloned.
                masker_memory = self.mask.memory
                if masker_memory.cachedir is None:
                    # Clone without the memory, then restore it on the
                    # user's masker and give the clone a fresh one.
                    self.mask.memory = None
                    self.masker_ = clone(self.mask)
                    self.mask.memory = masker_memory
                    self.masker_.memory = Memory(cachedir=None)
                else:
                    # The error was raised for another reason: propagate it
                    # with its original traceback.
                    raise

            # Any parameter set on this estimator (i.e. not None) takes
            # precedence over the value carried by the cloned masker.
            for param_name in [
                    'target_affine', 'target_shape', 'smoothing_fwhm',
                    'low_pass', 'high_pass', 't_r', 'memory', 'memory_level'
            ]:
                our_param = getattr(self, param_name)
                if our_param is None:
                    # Default value
                    continue
                if getattr(self.masker_, param_name) is not None:
                    warnings.warn('Parameter %s of the masker overriden' %
                                  param_name)
                setattr(self.masker_, param_name, our_param)

        # Masker warns if it has a mask_img and is passed
        # imgs to fit().  Avoid the warning by being careful
        # when calling fit.
        if self.masker_.mask_img is None:
            self.masker_.fit(imgs)
        else:
            self.masker_.fit()
        self.mask_img_ = self.masker_.mask_img_

        parameters = get_params(MultiNiftiMasker, self)
        # Remove non-specific and redundant parameters
        for param_name in [
                'memory', 'memory_level', 'confounds', 'verbose', 'n_jobs'
        ]:
            parameters.pop(param_name, None)

        parameters['detrend'] = True

        # Now do the subject-level signal extraction (i.e. data-loading +
        # PCA)

        subject_pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(session_pca)(img,
                                 self.masker_.mask_img_,
                                 parameters,
                                 n_components=self.n_components,
                                 memory=self.memory,
                                 memory_level=self.memory_level,
                                 confounds=confound,
                                 verbose=self.verbose,
                                 random_state=self.random_state)
            for img, confound in zip(imgs, confounds))
        # Each job returns a pair; split them into two parallel tuples.
        subject_pcas, subject_svd_vals = zip(*subject_pcas)

        if len(imgs) > 1:
            if not self.do_cca:
                # Scale every subject's rows in place by the matching values.
                for subject_pca, subject_svd_val in zip(subject_pcas,
                                                        subject_svd_vals):
                    subject_pca *= subject_svd_val[:, np.newaxis]
            # Stack the per-subject results row-wise into one array.
            data = np.empty(
                (len(imgs) * self.n_components, subject_pcas[0].shape[1]),
                dtype=subject_pcas[0].dtype)
            for index, subject_pca in enumerate(subject_pcas):
                if self.n_components > subject_pca.shape[0]:
                    raise ValueError('You asked for %i components. '
                                     'This is larger than the single-subject '
                                     'data size (%d).' %
                                     (self.n_components, subject_pca.shape[0]))
                data[index * self.n_components:(index + 1) *
                     self.n_components] = subject_pca
            data, variance, _ = self._cache(randomized_svd,
                                            func_memory_level=3)(
                                                data.T,
                                                n_components=self.n_components,
                                                transpose=True,
                                                random_state=self.random_state)
            # as_ndarray is to get rid of memmapping
            data = as_ndarray(data.T)
        else:
            # Single subject: use its result directly.
            data = subject_pcas[0]
            variance = subject_svd_vals[0]
        self.components_ = data
        self.variance_ = variance
        return self
Example #13
0
        f1.append(f1_score(y_test, y_test_pred))

    return scores_train, scores_test, precision, f1


print('{:<13} {:<16} {:<13} {:<16} {:<13} {:<16} {:<13} {:<16} {:<}'.format(
    '~|Acc@Train', 'IQR|Acc@Train', '~|Acc@Test', 'IQR|Acc@Test',
    '~|Prec@Test', 'IQR|Prec@Test', '~|F1@Test', 'IQR|F1@Test', 'Config'))
for func, funcname in funcs:
    try:
        func.set_params(n_jobs=1)
    except Exception:
        pass

    result = Parallel(n_jobs=n_jobs, verbose=0)(delayed(parallel_fit)(
        func, seeds[seeds_per_job[i]:seeds_per_job[i + 1]], X, y, test_size)
                                                for i in range(n_jobs))
    scores_train, scores_test, precision, f1 = zip(*result)
    scores_train = list(itertools.chain.from_iterable(scores_train))
    scores_test = list(itertools.chain.from_iterable(scores_test))
    precision = list(itertools.chain.from_iterable(precision))
    f1 = list(itertools.chain.from_iterable(f1))
    if funcname is None:
        funcname = str(func)
        funcname = funcname[:funcname.find('(')]
    print(
        '{:<13.3f} {:<16.5f} {:<13.3f} {:<16.5f} {:<13.3f} {:<16.5f} {:<13.3f} {:<16.5f} {:<}'
        .format(np.median(scores_train),
                np.subtract(*np.percentile(scores_train, [75, 25])),
                np.median(scores_test),
                np.subtract(*np.percentile(scores_test, [75, 25])),
Example #14
0
    def fit(self, X, y):
        """Build a Bagging ensemble of estimators from the training
           set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).


        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``max_samples`` or ``max_features`` resolve to a value outside
            ``(0, n_samples]`` / ``(0, n_features]``, if ``oob_score`` is
            combined with ``bootstrap=False`` or ``warm_start=True``, or if
            ``n_estimators`` is smaller than the number of estimators already
            fitted under warm start.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        X, y = check_X_y(X, y, ['csr', 'csc'])

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        # Integer values are taken as absolute counts, anything else is
        # treated as a fraction of the available samples / features.
        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        if self.warm_start and self.oob_score:
            raise ValueError("Out of bag estimate only available"
                             " if warm_start=False")

        if hasattr(self, "oob_score_") and self.warm_start:
            del self.oob_score_

        # Start from scratch unless warm-starting on top of existing
        # estimators. The hasattr guard covers a first fit with
        # warm_start=True on an instance that never created `estimators_`.
        if (not self.warm_start or not hasattr(self, "estimators_")
                or len(self.estimators_) == 0):
            # Free allocated memory, if any
            self.estimators_ = []
            self.estimators_samples_ = []
            self.estimators_features_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
            return self

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            n_more_estimators, self.n_jobs)

        # Advance random state to state after training
        # the first n_estimators
        if self.warm_start and len(self.estimators_) > 0:
            random_state.randint(MAX_INT, size=len(self.estimators_))

        # One seed per new estimator; each job receives its own slice.
        seeds = random_state.randint(MAX_INT, size=n_more_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            # TEF: changed following call to balanced procedure:
            delayed(_parallel_build_balanced_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                seeds[starts[i]:starts[i + 1]],
                verbose=self.verbose) for i in range(n_jobs))

        # Reduce: every job returns a triple of lists (estimators, samples,
        # features); flatten each position across jobs.
        self.estimators_ += list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_samples_ += list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.estimators_features_ += list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Example #15
0
    X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix="$", n=5, return_vect=True, binarize=False)
    X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix="$", n=5, return_vect=True, binarize=False)

    X_sg_n_clean = preprocess.load_data("data/singular_n.txt", labels=False)
    X_sg_n = v_sg.transform(X_sg_n_clean)
    # X_sg_n = Binarizer(copy=False).transform(v_sg.transform(X_sg_n_clean))

    X_pl_n_clean = preprocess.load_data("data/plural_n.txt", labels=False)
    X_pl_n = v_pl.transform(X_pl_n_clean)
    # X_pl_n = Binarizer(copy=False).transform(v_pl.transform(X_pl_n_clean))

    scores = []
    n_steps = 100
    print "size  \tratio\tsg_score\tpl_score\tscore   \tsg_std  \tpl_std  \tstd"
    for train_proportion in np.linspace(0.1, 1, 10):
        train_size = len(X_sg) * train_proportion
        steps = [shuffle(X_sg_p, y_sg, X_pl_p, y_pl, n_samples=train_size) for k in xrange(n_steps)]
        step_scores = Parallel(n_jobs=-1, verbose=False)(
            delayed(nouns_score)(*step, X_sg_test=X_sg_n, X_pl_test=X_pl_n) for step in steps
        )
        step_scores = np.array(step_scores)

        score = np.r_[train_size, train_proportion, step_scores.mean(axis=0), step_scores.std(axis=0)]
        print "%d\t%.2f\t%.6f\t%.6f\t%.6f\t%.4e\t%.4e\t%.4e" % tuple(score)
        scores.append(score)
    print "Pickling scores..."

    scores = np.array(scores)
    plot(scores)
    np.save("train_size_i", scores)
Example #16
0
    def fit(self, X, y=None):
        """Fit (estimates) the clusters.

        When ``init`` is the string ``'random'`` and ``n_init > 1``, the
        clustering is run ``n_init`` times with different seeds and the run
        with the smallest inertia is kept. Otherwise a single run is done.

        Parameters
        ----------
        X : ndarray, shape (n_trials, n_channels, n_channels)
            ndarray of SPD matrices.
        y : ndarray | None (default None)
            Forwarded unchanged to ``_fit_single``; kept for compatibility
            with the sklearn API.

        Returns
        -------
        self : Kmeans instance
            The Kmean instance.
        """
        # Compare by value, and only when `init` is a string: `init` may be
        # something else (e.g. an array), for which `==` is not a plain bool.
        random_init = isinstance(self.init, str) and self.init == 'random'

        if not random_init or self.n_init == 1:
            # no need to iterate if init is not random
            labels, inertia, mdm = _fit_single(X,
                                               y,
                                               n_clusters=self.n_clusters,
                                               init=self.init,
                                               random_state=self.seed,
                                               metric=self.metric,
                                               max_iter=self.max_iter,
                                               tol=self.tol,
                                               n_jobs=self.n_jobs)
        else:
            # Derive one seed per initialisation from the user seed.
            numpy.random.seed(self.seed)
            seeds = numpy.random.randint(numpy.iinfo(numpy.int32).max,
                                         size=self.n_init)
            if self.n_jobs == 1:
                # Sequential path: keep the result of every run.
                res = [
                    _fit_single(X,
                                y,
                                n_clusters=self.n_clusters,
                                init=self.init,
                                random_state=seed,
                                metric=self.metric,
                                max_iter=self.max_iter,
                                tol=self.tol) for seed in seeds
                ]
            else:
                res = Parallel(n_jobs=self.n_jobs, verbose=0)(
                    delayed(_fit_single)(X,
                                         y,
                                         n_clusters=self.n_clusters,
                                         init=self.init,
                                         random_state=seed,
                                         metric=self.metric,
                                         max_iter=self.max_iter,
                                         tol=self.tol,
                                         n_jobs=1) for seed in seeds)

            # Transpose the list of (labels, inertia, mdm) triples.
            labels, inertia, mdm = zip(*res)

            # Keep the run with the lowest inertia.
            best = numpy.argmin(inertia)
            mdm = mdm[best]
            labels = labels[best]
            inertia = inertia[best]

        self.mdm_ = mdm
        self.inertia_ = inertia
        self.labels_ = labels

        return self
Example #17
0

if __name__ == "__main__":
    # classifier = BipartiteRankBoost(n_estimators=50, verbose=1)

    classifier = GradientBoostingClassifier(
        n_estimators=800, subsample=0.9, learning_rate=0.05, max_depth=3, random_state=1, verbose=0
    )

    feature_list = ["nauthors", "npapers", "year", "nattrib", "ncoauthor", "paperrank", "globalpaperrank", "nappear"]

    trainfeatures = loadFeatures(feature_list, mode="train")
    trainlabels = cPickle.load(open("labels.train", "rb"))

    cv_authors = KFold(len(trainlabels), n_folds=5, indices=True, shuffle=True, random_state=1)

    score = Parallel(n_jobs=-1)(
        delayed(crossValidation)(trainlabels, trainfeatures, classifier, train_authors, test_authors, pairwise=False)
        for train_authors, test_authors in cv_authors
    )

    score = np.array(score)
    print "score mean, std, mean-std:", score.mean(), score.std(), score.mean() - score.std()

    # testfeatures = loadFeatures(feature_list, mode='test')
    # testlabels = cPickle.load(open('labels.test', 'rb'))

    # trainAndPredict(trainlabels, trainfeatures, testlabels, testfeatures, classifier, pairwise=False)

    # shuffleCrossValidation(trainlabels, trainfeatures, classifier, n_iter=5, verbose=0, pairwise=False)
Example #18
0
def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
            pre_dispatch='2*n_jobs', confidence=-1.):
    """Run ``_predict_chunk`` over chunks of ``reads`` and flatten the output.

    ``reads`` is split by ``_chunks(reads, chunk_size)``; every chunk is
    handed to ``_predict_chunk`` together with ``pipeline``, ``separator``
    and ``confidence`` through a joblib ``Parallel`` call (one chunk per
    batch). The parallel call is issued when this function is called; the
    returned generator only flattens its result, yielding the items of each
    chunk result in order.
    """
    runner = Parallel(n_jobs=n_jobs, batch_size=1,
                      pre_dispatch=pre_dispatch)
    tasks = (delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
             for chunk in _chunks(reads, chunk_size))
    chunk_results = runner(tasks)
    return (item for chunk_result in chunk_results for item in chunk_result)
Example #19
0
# first setting: time the methods for each sample size in n_samples_grid,
# repeating every measurement N_simu times.
print("\nBinacox vs. Auto Cutoff computing times")
n_features = 1
n_cut_points = 2
# NOTE(review): cov_corr and sparsity are not used in the visible part of
# this script - presumably read further down or by helpers; confirm.
cov_corr = .5
sparsity = .2
N_simu = 100
n_samples_grid = [300, 500, 1000, 2000, 4000]

# Wide table: one row per simulation, one timing column per method.
result_ = pd.DataFrame(columns=["n_samples", "time_bina", "time_ac_all",
                                "time_ac_grid"])
for i, n_samples in enumerate(n_samples_grid):
    print("n_samples: %d/%d " % ((i + 1), len(n_samples_grid)))
    result_n = Parallel(n_jobs=10)(
        delayed(get_times1)(n_simu, n_samples, n_features,
                            n_cut_points)
        for n_simu in range(N_simu))
    result_n = pd.DataFrame(result_n,
                            columns=["n_samples", "time_bina", "time_ac_all",
                                     "time_ac_grid"])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent call and also works on older pandas versions.
    result_ = pd.concat([result_, result_n], ignore_index=True)

# Long table: one row per (n, method) measurement, built method by method
# from the wide table above.
result = pd.DataFrame(columns=["n", "method", "time"])
tmp = pd.DataFrame(columns=["n", "method", "time"])
tmp.n = result_.n_samples
tmp.method = "Binacox"
tmp.time = result_.time_bina
result = pd.concat([result, tmp], ignore_index=True)

tmp.n = result_.n_samples
tmp.method = "AC all"
# Directory holding the image sequence to analyse (machine-specific path).
dir_imgs = r'F:\Avinash\Ablations & Behavior\RS neurons\M homologs\20190308\20190309_behavior\f3_abl_vibAmpOnly_amp_3\fastDir_03-14-19-065345'
headDiam = 1  # Approximate head diameter in mm (for determining head position by weighted average)

#%% Compute background image
print('Computing background...')
img_back = fsb.track.computeBackground(dir_imgs)

print('Estimating pixel size...')
# Only the first element of the returned value is kept.
pxlSize = fsb.getPxlSize(img_back)[0]

#%% Find fish position
imgNames = ft.findAndSortFilesInDir(dir_imgs, ext='bmp')
# Half the head diameter expressed in pixels, truncated to an int
# (presumably pxlSize is in mm per pixel - TODO confirm).
r = int(0.5 * headDiam / pxlSize)

print('Estimating fish position...')
# NOTE(review): imports live mid-script here; `sklearn.externals.joblib` is
# not available in recent scikit-learn releases - confirm the target version.
from sklearn.externals.joblib import Parallel, delayed
from skimage.io import imread
# One findFish call per image file, run through joblib with 32 jobs.
fp = Parallel(n_jobs=32, verbose=1)(delayed(fsb.track.findFish)(
    imread(os.path.join(dir_imgs, imgName)), back_img=img_back, r=r)
                                    for imgName in imgNames)
fp = np.array(fp)

#%% Sanity check - Look at fish position trajectories
nFramesInTrl = 750
# Split the per-frame positions into groups of nFramesInTrl frames
# (presumably one group per trial - confirm against ft.sublistsFromList).
fp_trl = ft.sublistsFromList(fp, nFramesInTrl)
plt.figure(figsize=(16, 16))
plt.imshow(img_back, cmap='gray')
# Overlay each group's trajectory on the background, one colour per group.
for trl, fp_ in enumerate(fp_trl):
    fp_ = np.array(fp_)
    plt.plot(fp_[:, 0], fp_[:, 1], '.-', markersize=4, color=plt.cm.tab20(trl))
Example #21
0
    def fit(self, X=None, y=None):
        """Fill ``results_`` with the exact-support rate over a 2D grid.

        For each of 4 ``alpha`` values a graph is drawn with ``new_graph``;
        for each entry of ``grid_`` a sample count is derived, a penalty is
        selected once with a clone of ``model_selection_estimator``, and
        ``sp_trial`` is run ``n_trials`` times. The stored value is the sum
        of the trial outputs divided by ``n_trials``.

        Parameters
        ----------
        X, y : ignored
            Not used; the data are simulated internally.

        Returns
        -------
        self
            With ``results_``, ``grid_``, ``alphas_``, ``ks_`` and
            ``is_fitted`` set.
        """
        n_alphas = 4

        self.results_ = np.zeros((n_alphas, self.n_grid_points))
        self.grid_ = np.logspace(0, np.log10(200), self.n_grid_points)

        # Exponent range of the alpha grid depends on the adjacency type.
        # NOTE(review): the non-erdos-renyi lower bound uses np.log (natural
        # log) while the upper bound uses np.log10 - confirm this is intended.
        if self.adj_type == 'erdos-renyi':
            alpha_lo, alpha_hi = -2.3, np.log10(.025)
        else:
            alpha_lo, alpha_hi = np.log(.1), np.log10(.3)
        self.alphas_ = np.logspace(alpha_lo, alpha_hi, n_alphas)[::1]

        self.ks_ = []

        for aidx, alpha in enumerate(self.alphas_):
            if self.verbose:
                print('at alpha {} ({}/{})'.format(alpha, aidx, n_alphas))

            # One fixed graph per alpha; the sampling RNG restarts here too.
            cov, prec, adj = new_graph(self.n_features,
                                       alpha,
                                       adj_type=self.adj_type,
                                       random_sign=False,
                                       seed=1)
            n_edges = np.count_nonzero(np.triu(adj, 1).flat)
            self.ks_.append(n_edges)
            mcmc_prng = np.random.RandomState(2)
            if self.verbose:
                print('   Graph has {} nonzero entries'.format(n_edges))

            for sidx, sample_grid in enumerate(self.grid_):
                n_samples = int(sample_grid * self.n_features)

                # Model selection is done once per grid cell.
                samples = mvn(n_samples,
                              self.n_features,
                              cov,
                              random_state=mcmc_prng)
                selector = clone(self.model_selection_estimator)
                selector.fit(samples)
                lam = getattr(selector, self.penalty_)

                if self.verbose:
                    # Array-valued penalties are summarised by their norm.
                    if isinstance(lam, np.ndarray):
                        shown_lam = np.linalg.norm(lam)
                    else:
                        shown_lam = lam
                    print('   ({}/{}), n_samples = {}, selected lambda = {}'.format(
                        sidx, self.n_grid_points, n_samples, shown_lam))

                # Pick the trial estimator and the object whose penalty
                # parameter is patched with the selected lambda.
                if self.trial_estimator is None:
                    trial_estimator = QuicGraphLasso(lam=lam,
                                                     mode='default',
                                                     init_method='corrcoef')
                    penalised = trial_estimator
                elif self.trial_estimator == 'Adaptive':
                    inner = QuicGraphLasso(lam=lam,
                                           mode='default',
                                           init_method='corrcoef')
                    trial_estimator = AdaptiveGraphLasso(
                        estimator=inner, method='inverse_squared')
                    penalised = trial_estimator.estimator_
                else:
                    trial_estimator = self.trial_estimator
                    penalised = trial_estimator
                penalised.set_params(**{self.penalty: lam})

                # Estimate statistical power with n_trials threaded trials.
                run_trials = Parallel(n_jobs=self.n_jobs,
                                      verbose=False,
                                      backend='threading')
                exact_support_counts = run_trials(
                    delayed(sp_trial)(trial_estimator, n_samples,
                                      self.n_features, cov, adj, mcmc_prng)
                    for _ in range(self.n_trials))

                n_exact = np.sum(exact_support_counts)
                self.results_[aidx, sidx] = 1. * n_exact / self.n_trials

            if self.verbose:
                print('Results at this row: {}'.format(self.results_[aidx, :]))

        self.is_fitted = True
        return self
Example #22
0
    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
        """Score every feature subset in the size range and keep the best.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors. Objects exposing ``.loc`` (pandas DataFrames)
            are converted through ``.values`` before scoring.
        y : array-like, shape = [n_samples]
            Target values.
        custom_feature_names : None or tuple (default: None)
            Custom feature names; must have one entry per column of X.
        groups : array-like, with shape (n_samples,), optional
            Group labels, forwarded to ``_calc_score``.
        fit_params : dict of string -> object, optional
            Extra keyword arguments forwarded to ``_calc_score``.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``custom_feature_names`` does not match the column count.
        AttributeError
            If ``min_features`` / ``max_features`` are not ints within
            ``[1, n_features]`` or if ``max_features < min_features``.
        """

        # Start from a clean state in case fit was called before.
        self.subsets_ = {}
        self.fitted = False
        self.interrupted_ = False
        self.best_idx_ = None
        self.best_feature_names_ = None
        self.best_score_ = None

        X_ = X.values if hasattr(X, 'loc') else X

        if custom_feature_names is not None:
            if len(custom_feature_names) != X.shape[1]:
                raise ValueError('If custom_feature_names is not None, '
                                 'the number of elements in custom_feature_names '
                                 'must equal the number of columns in X.')

        upper = self.max_features
        if not (isinstance(upper, int) and 1 <= upper <= X.shape[1]):
            raise AttributeError('max_features must be'
                                 ' smaller than %d and larger than 0' %
                                 (X.shape[1] + 1))

        lower = self.min_features
        if not (isinstance(lower, int) and 1 <= lower <= X.shape[1]):
            raise AttributeError('min_features must be'
                                 ' smaller than %d and larger than 0' %
                                 (X.shape[1] + 1))

        if self.max_features < self.min_features:
            raise AttributeError('min_features must be <= max_features')

        subset_sizes = range(self.min_features, self.max_features + 1)

        # One combinations iterator per subset size, chained in size order.
        per_size = [
            combinations(range(X_.shape[1]), r=size) for size in subset_sizes
        ]
        candidates = chain(*per_size)

        def n_choose_r(n, r):
            """Return the number of ways to pick r items out of n."""
            r = min(r, n - r)
            if r == 0:
                return 1
            numer = reduce(op.mul, range(n, n - r, -1))
            denom = reduce(op.mul, range(1, r + 1))
            return numer // denom

        # Total number of candidate subsets, used for progress output and to
        # cap the number of jobs.
        all_comb = np.sum(
            [n_choose_r(X_.shape[1], size) for size in subset_sizes])

        n_jobs = min(self.n_jobs, all_comb)
        parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
        scored = parallel(
            delayed(_calc_score)(self, X_, y, subset, groups=groups,
                                 **fit_params)
            for subset in candidates)
        work = enumerate(scored)

        try:
            for iteration, (feature_idx, cv_scores) in work:
                entry = {
                    'feature_idx': feature_idx,
                    'cv_scores': cv_scores,
                    'avg_score': np.mean(cv_scores)
                }
                self.subsets_[iteration] = entry

                if self.print_progress:
                    sys.stderr.write('\rFeatures: %d/%d' %
                                     (iteration + 1, all_comb))
                    sys.stderr.flush()

                if self._TESTING_INTERRUPT_MODE:
                    # Test hook: resolve names, then simulate a Ctrl-C.
                    names_result = _get_featurenames(self.subsets_,
                                                     self.best_idx_,
                                                     custom_feature_names,
                                                     X)
                    self.subsets_, self.best_feature_names_ = names_result
                    raise KeyboardInterrupt

        except KeyboardInterrupt:
            self.interrupted_ = True
            sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

        # Pick the first subset holding the highest average score.
        max_score = float('-inf')
        for key, entry in self.subsets_.items():
            if entry['avg_score'] > max_score:
                max_score = entry['avg_score']
                best_subset = key

        self.best_idx_ = self.subsets_[best_subset]['feature_idx']
        self.best_score_ = max_score
        self.fitted = True
        names_result = _get_featurenames(self.subsets_,
                                         self.best_idx_,
                                         custom_feature_names,
                                         X)
        self.subsets_, self.best_feature_names_ = names_result
        return self
Example #23
0
    def fit(self, X, y, X_val=None, y_val=None, **kwargs):
        """Fit underlying estimators, one per binarized label column.

        If the number of columns of X is 2, only one model is trained, on
        the second column (class 1).

        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_classes]
            Data. Each column is paired with the matching label column.
        y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
            Multi-class targets. An indicator matrix turns on multilabel
            classification.
        X_val : array-like or None, optional
            Validation data, split column-wise like ``X``. Only used when
            the ``fit`` method of ``self.estimator`` has an ``X_val``
            argument.
        y_val : array-like or None, optional
            Validation targets, binarized with the label binarizer fitted
            on ``y``. Only used together with ``X_val``.
        **kwargs
            Accepted but not used by this method.

        Returns
        -------
        self
        """
        # A sparse LabelBinarizer, with sparse_output=True, has been shown to
        # outperform or match a dense label binarizer in all cases and has
        # also resulted in less or equal memory consumption in the fit_ovr
        # function overall.
        if X.shape[1] == 2:
            x_columns = (X[:, 1].ravel().T, )
        else:
            x_columns = (col.ravel() for col in X.T)

        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        Y = self.label_binarizer_.fit_transform(y)
        Y = Y.tocsc()
        self.classes_ = self.label_binarizer_.classes_
        y_columns = (col.toarray().ravel() for col in Y.T)

        # inspect.getargspec no longer exists on Python 3.11+; prefer
        # getfullargspec and fall back only where it is unavailable.
        argspec = getattr(inspect, 'getfullargspec', None) \
            or inspect.getargspec
        supports_validation = 'X_val' in argspec(self.estimator.fit).args

        if supports_validation and X_val is not None:
            if X_val.shape[1] == 2:
                x_val_columns = (X_val[:, 1].ravel().T, )
            else:
                x_val_columns = (col.ravel() for col in X_val.T)

            Y_val = self.label_binarizer_.transform(y_val)
            Y_val = Y_val.tocsc()
            y_val_columns = (col.toarray().ravel() for col in Y_val.T)
        else:
            # One placeholder per label column. Sizing by rows instead would
            # let zip() below drop estimators when n_samples < n_classes.
            n_label_columns = np.shape(Y)[1]
            x_val_columns = [None] * n_label_columns
            y_val_columns = [None] * n_label_columns

        # In cases where individual estimators are very fast to train setting
        # n_jobs > 1 in can results in slower performance due to the overhead
        # of spawning threads.  See joblib issue #112.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_binary)(self.estimator,
                                 x_column,
                                 y_column,
                                 x_val_column,
                                 y_val_column,
                                 classes=[
                                     "not %s" %
                                     self.label_binarizer_.classes_[i],
                                     self.label_binarizer_.classes_[i]
                                 ])
            for i, (x_column, y_column, x_val_column, y_val_column) in
            enumerate(zip(x_columns, y_columns, x_val_columns, y_val_columns)))

        return self
Example #24
0
    def extract_dataset(self,
                        dataset,
                        n_jobs=-1,
                        verbosity=2,
                        calc4train_set=False):
        """Compute one feature per image and store the results on ``dataset``.

        Images (and, when available, their masks, regions and maps) are
        gathered from ``dataset.probe`` and ``dataset.gallery``, transformed in
        parallel with ``_parallel_transform`` and written back as numpy arrays
        to the ``fe_train`` / ``fe_test`` attributes of probe and gallery.

        Parameters
        ----------
        dataset : object
            Must expose ``probe`` and ``gallery`` (each with ``images_*``,
            ``masks_*``, ``regions_*`` and ``maps_*`` sequences) as well as
            ``test_size`` and, when ``calc4train_set`` is true, ``train_size``.
        n_jobs : int, default=-1
            Forwarded to ``joblib.Parallel``.
        verbosity : int, default=2
            A progress line is printed when greater than 1.
        calc4train_set : bool, default=False
            When true the train splits are processed in addition to the test
            splits.

        Returns
        -------
        None
            Results are stored on ``dataset.probe`` / ``dataset.gallery``.
        """
        if verbosity > 1:
            print(
                "   Calculating Histograms %s, %s" %
                (colorspace_name[self._colorspace], str(self._original_bins)))

        probe, gallery = dataset.probe, dataset.gallery

        def _collect(kind):
            # Concatenate with ``+`` so a NEW list is built every time; the
            # previous ``x = probe.<kind>_test; x += ...`` extended the
            # dataset's own list in place.
            # Order matters: it must match the slicing of ``results`` below.
            if calc4train_set:
                return (getattr(probe, kind + '_train') +
                        getattr(probe, kind + '_test') +
                        getattr(gallery, kind + '_train') +
                        getattr(gallery, kind + '_test'))
            return (getattr(probe, kind + '_test') +
                    getattr(gallery, kind + '_test'))

        images = _collect('images')

        # Optional per-image inputs: when the probe test split has none, every
        # image gets ``None`` for that input.
        no_extra = [None] * len(images)
        masks = _collect('masks') if probe.masks_test else no_extra
        regions = _collect('regions') if probe.regions_test else no_extra
        maps = _collect('maps') if probe.maps_test else no_extra

        results = Parallel(n_jobs)(
            delayed(_parallel_transform)(self, im, mask, reg, m)
            for im, mask, reg, m in zip(images, masks, regions, maps))

        # Split the flat result list back into the order it was assembled in.
        test_len = dataset.test_size
        if calc4train_set:
            train_len = dataset.train_size
            probe_end = train_len + test_len
            probe.fe_train = np.asarray(results[:train_len])
            probe.fe_test = np.asarray(results[train_len:probe_end])
            gallery.fe_train = np.asarray(results[probe_end:-test_len])
            gallery.fe_test = np.asarray(results[-test_len:])
        else:
            probe.fe_test = np.asarray(results[:test_len])
            gallery.fe_test = np.asarray(results[-test_len:])
Example #25
0
# generate cross validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    # leave-one-value-out: one fold per value of the chosen ARFF attribute
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    # plain k-fold
    fold_values = range(int(p['foldCount']))
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
bag_values = range(bag_count) if bag_count > 1 else [0]

# ensure java's classpath is set (raises KeyError when CLASSPATH is missing)
classpath = environ['CLASSPATH']

# command for cluster execution if enabled
use_cluster = 'useCluster' in p and p['useCluster'] == 'true'
cluster_cmd = 'rc.py --cores 1 --walltime 06:00:00 --queue small --allocation acc_9'

# load classifiers from file, skip commented lines; the context manager
# closes the file handle, which the previous bare open() never did
with open(classifiers_fn) as classifiers_file:
    classifiers = [
        line.strip() for line in classifiers_file
        if not line.startswith('#')
    ]

working_dir = dirname(abspath(argv[0]))
# a single local worker when jobs are handed to the cluster, else all CPUs
n_jobs = 1 if use_cluster else -1
# one task per (classifier, fold, bag) combination
all_parameters = list(
    product([working_dir], [project_path], classifiers, fold_values,
            bag_values))
Parallel(n_jobs=n_jobs, verbose=50)(delayed(classify)(parameters)
                                    for parameters in all_parameters)
Example #26
0
def compute_multi_epi_mask(epi_imgs, lower_cutoff=0.2, upper_cutoff=0.9,
                           connected=True, opening=2, threshold=0.5,
                           target_affine=None, target_shape=None,
                           exclude_zeros=False, n_jobs=1,
                           memory=None, verbose=0):
    """ Compute a common mask for several sessions or subjects of fMRI data.

    A mask is computed for every item of `epi_imgs` with `compute_epi_mask`
    (in parallel), and the individual masks are then combined by
    `intersect_masks`.

    Parameters
    ----------
    epi_imgs: list of Niimgs
        A list of arrays, each item being a subject or a session.
        3D and 4D images are accepted.
        If 3D images are given, we suggest using the mean image of each
        session.

    threshold: float, optional
        the inter-session threshold: the fraction of the total number of
        sessions in which a voxel must be in the mask to be kept in the
        common mask.
        threshold=1 corresponds to keeping the intersection of all
        masks, whereas threshold=0 is the union of all masks.

    lower_cutoff: float, optional
        lower fraction of the histogram to be discarded.

    upper_cutoff: float, optional
        upper fraction of the histogram to be discarded.

    connected: boolean, optional
        if connected is True, only the largest connected component is kept.
        Passed both to `compute_epi_mask` and to `intersect_masks`.

    opening: optional
        Forwarded unchanged to `compute_epi_mask`.

    exclude_zeros: boolean, optional
        Consider zeros as missing values for the computation of the
        threshold. This option is useful if the images have been
        resliced with a large padding of zeros.

    target_affine: 3x3 or 4x4 matrix, optional
        This parameter is passed to image.resample_img. Please see the
        related documentation for details.

    target_shape: 3-tuple of integers, optional
        This parameter is passed to image.resample_img. Please see the
        related documentation for details.

    memory: instance of joblib.Memory or string
        Used to cache the function call.

    n_jobs: integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose: int, optional
        Verbosity level handed to joblib's Parallel.

    Returns
    -------
    mask : 3D nifti-like image
        The brain mask.

    Raises
    ------
    TypeError
        If `epi_imgs` is empty.
    """
    if not len(epi_imgs):
        raise TypeError('An empty object - %r - was passed instead of an '
                        'image or a list of images' % epi_imgs)

    # Options shared by every per-image mask computation.
    per_image_options = dict(lower_cutoff=lower_cutoff,
                             upper_cutoff=upper_cutoff,
                             connected=connected,
                             opening=opening,
                             exclude_zeros=exclude_zeros,
                             target_affine=target_affine,
                             target_shape=target_shape,
                             memory=memory)

    runner = Parallel(n_jobs=n_jobs, verbose=verbose)
    individual_masks = runner(
        delayed(compute_epi_mask)(single_img, **per_image_options)
        for single_img in epi_imgs)

    return intersect_masks(individual_masks,
                           connected=connected,
                           threshold=threshold)
    def smacof(similarities,
               metric=True,
               n_components=2,
               init=None,
               n_init=8,
               n_jobs=1,
               max_iter=300,
               verbose=0,
               eps=1e-3,
               random_state=None,
               return_n_iter=False):
        """
        Run multidimensional scaling with the SMACOF (Scaling by Majorizing a
        Complicated Function) algorithm and keep the best of several runs.

        SMACOF minimizes an objective function, the *stress*, using a
        majorization technique (the Guttman Transform). For metric MDS the
        algorithm can be summarized as:

        1. Set an initial start configuration, randomly or not.
        2. Compute the stress
        3. Compute the Guttman Transform
        4. Iterate 2 and 3 until convergence.

        The nonmetric variant adds a monotonic regression step before
        computing the stress.

        Each individual run is delegated to
        ``function_library._smacof_single``; this function only drives the
        ``n_init`` runs and selects the one with the lowest stress.

        Parameters
        ----------
        similarities : symmetric ndarray, shape (n_samples, n_samples)
            similarities between the points

        metric : boolean, optional, default: True
            compute metric or nonmetric SMACOF algorithm

        n_components : int, optional, default: 2
            number of dimensions in which to immerse the similarities
            (the original documentation states it is overridden when an
            initial array is provided; that handling is not visible here).

        init : {None or ndarray of shape (n_samples, n_components)}, optional
            if None, randomly chooses the initial configuration
            if array-like, it is copied and used as the start configuration;
            in that case only one run is performed and a warning is issued
            when ``n_init`` is not 1.

        n_init : int, optional, default: 8
            Number of times the algorithm is run with different
            initialisations. The result is the run with the lowest stress.

        n_jobs : int, optional, default: 1
            If 1, the runs are executed sequentially in this process with the
            shared ``random_state``. Otherwise the runs are dispatched
            through joblib's ``Parallel`` with this value, each run receiving
            its own integer seed drawn from ``random_state``.

        max_iter : int, optional, default: 300
            Maximum number of iterations of the SMACOF algorithm for a single
            run

        verbose : int, optional, default: 0
            level of verbosity

        eps : float, optional, default: 1e-3
            relative tolerance w.r.t. stress to declare convergence

        random_state : integer or numpy.RandomState, optional
            Passed through ``check_random_state``; used for the runs (or to
            draw their seeds when running in parallel).

        return_n_iter : bool
            Whether or not to return the number of iterations.

        Returns
        -------
        X : ndarray (n_samples,n_components)
            Coordinates of the n_samples points in a n_components-space

        stress : float
            The final value of the stress of the best run

        n_iter : int
            The number of iterations corresponding to the best stress.
            Returned only if `return_n_iter` is set to True.

        Notes
        -----
        "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
        Groenen P. Springer Series in Statistics (1997)

        "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
        Psychometrika, 29 (1964)

        "Multidimensional scaling by optimizing goodness of fit to a nonmetric
        hypothesis" Kruskal, J. Psychometrika, 29, (1964)
        """

        similarities = check_array(similarities)
        random_state = check_random_state(random_state)

        if hasattr(init, '__array__'):
            init = np.asarray(init).copy()
            # An explicit start configuration makes repeated runs pointless.
            if n_init != 1:
                warnings.warn(
                    'Explicit initial positions passed: '
                    'performing only one init of the MDS instead of %d' %
                    n_init)
                n_init = 1

        # Keyword arguments shared by every single run.
        run_options = dict(metric=metric,
                           n_components=n_components,
                           init=init,
                           max_iter=max_iter,
                           verbose=verbose,
                           eps=eps)

        if n_jobs == 1:
            # Sequential path: keep a running best over the runs.
            best_pos = best_stress = None
            for _ in range(n_init):
                run_pos, run_stress, run_iter = \
                    function_library._smacof_single(
                        similarities,
                        random_state=random_state,
                        **run_options)
                if best_stress is None or run_stress < best_stress:
                    best_stress = run_stress
                    best_pos = run_pos.copy()
                    best_iter = run_iter
        else:
            # Parallel path: one independent integer seed per run.
            seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
            runner = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))
            outcomes = runner(
                delayed(function_library._smacof_single)(
                    similarities,
                    random_state=seed,
                    **run_options)
                for seed in seeds)
            all_pos, all_stress, all_iters = zip(*outcomes)
            winner = np.argmin(all_stress)
            best_stress = all_stress[winner]
            best_pos = all_pos[winner]
            best_iter = all_iters[winner]

        if not return_n_iter:
            return best_pos, best_stress
        return best_pos, best_stress, best_iter
 # Fragment of a per-split loop body; `split`, `cv`, `new_order`, the *_shfl
 # arrays and the output arrays come from code that is not visible here.
 # From the second split on (split > 0), draw a fresh random permutation,
 # scatter labels, data and sample weights to the permuted positions and
 # rebuild the stratified folds on the shuffled labels.
 # NOTE(review): for split == 0 `cv` and the *_shfl arrays are presumably
 # prepared before this fragment - confirm.
 if split > 0:
     np.random.shuffle(new_order)
     y_shfl[new_order] = np.copy(y)
     Xm_shfl[new_order, :, :] = np.copy(Xm)
     sw_shfl[new_order] = np.copy(sample_weight)
     cv = StratifiedKFold(y_shfl, k=n_folds)
 # Cross-validation computed in parallel
 # run parallel computation: one `my_pipeline` call per (train, test) fold
 out = Parallel(n_jobs=n_cores)(delayed(my_pipeline)(train=train,
                                                     test=test,
                                                     Xm_shfl=Xm_shfl,
                                                     y_shfl=y_shfl,
                                                     sw_shfl=sw_shfl,
                                                     Xmg=Xmg,
                                                     dims=dims,
                                                     fs=fs,
                                                     scaler=scaler,
                                                     clf=clf,
                                                     n_samples=n_samples,
                                                     n_dims=n_dims,
                                                     n_dims_tg=n_dims_tg,
                                                     n_classes=n_classes)
                                for train, test in cv)
 # reorder results folds and splits
 # `cv` is iterated a second time here, in the same order as above, so that
 # out[fold] lines up with the fold's (train, test) indices.
 for fold, (train, test) in enumerate(cv):
     # fold membership: 1 marks training samples, 0 marks test samples
     all_folds[split, fold, train] = 1
     all_folds[split, fold, test] = 0
     coef[split, fold, :, :, :] = out[fold]['coef']
     if compute_predict:
         # the fold's predictions are indexed through new_order[test] before
         # being stored at the `test` positions of this split
         predict[split,
                 test, :, :] = out[fold]['predict'][new_order[test], :, :]
Example #29
0
def data_consist(_table1,
                 _table2,
                 _key1,
                 _key2,
                 _schema1,
                 _schema2,
                 fname,
                 sample_size=1.0,
                 feature_colname1='column',
                 feature_colname2='column',
                 dtype_colname1='type',
                 dtype_colname2='type',
                 output_root='',
                 keep_images=False,
                 n_jobs=1):
    """
    Check consistency between two tables and write an Excel report.

    Both tables are restricted to a random sample of the keys they share;
    every feature whose declared type matches in both schemas is then
    compared (per type: key, numeric, str, date) and the results are written
    to ``<output_root>/data_consist_<fname>.xlsx``.

    Parameters
    ----------
    _table1: pandas DataFrame
        one of the two tables to compare
    _table2: pandas DataFrame
        one of the two tables to compare
    _key1: string
        key for table1
    _key2: string
        key for table2
    _schema1: pandas DataFrame
        data schema (contains column names and corresponding data types)
        for _table1
    _schema2: pandas DataFrame
        data schema (contains column names and corresponding data types)
        for _table2
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        float (<= 1.0): fraction of each table's row count used as the number
        of shared keys to sample
        int (> 1): number of shared keys to sample (useful for large tables)
        In both cases the number is capped by the count of shared keys.
    feature_colname1: string, default='column'
        name of the column for feature of _table1
    feature_colname2: string, default='column'
        name of the column for feature of _table2
    dtype_colname1: string, default='type'
        name of the column for data type of _table1
    dtype_colname2: string, default='type'
        name of the column for data type of _table2
    output_root: string, default=''
        the root directory for the output file
    keep_images: boolean, default=False
        whether to keep all generated images
    n_jobs: int, default=1
        the number of jobs to run in parallel
    """

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # prepare directory for generated images (start from an empty directory)
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # copy data tables so the caller's frames are never modified
    table1 = _table1.copy()
    table2 = _table2.copy()

    # calculate the sample size (in number of shared keys)
    both_keys = list(
        set(table1[_key1].values).intersection(set(table2[_key2].values)))
    if sample_size <= 1.0:
        # fractional sample size: relative to each table's row count
        n_sample_keys = min(int(table1.shape[0] * sample_size),
                            int(table2.shape[0] * sample_size),
                            len(both_keys))
    else:
        # absolute sample size; previously this case was silently ignored
        # and the full tables were compared
        n_sample_keys = min(int(sample_size), len(both_keys))
    sample_keys = np.random.choice(both_keys, n_sample_keys, replace=False)
    table1 = table1[table1[_key1].isin(sample_keys)].reset_index(drop=True)
    table2 = table2[table2[_key2].isin(sample_keys)].reset_index(drop=True)

    # copy both schema, keeping only the feature-name and type columns
    schema1 = _schema1.copy()[[feature_colname1,
                               dtype_colname1]].rename(columns={
                                   feature_colname1: 'column_1',
                                   dtype_colname1: 'type_1'
                               })
    schema2 = _schema2.copy()[[feature_colname2,
                               dtype_colname2]].rename(columns={
                                   feature_colname2: 'column_2',
                                   dtype_colname2: 'type_2'
                               })

    # merge two schemas
    schema = schema1.merge(schema2,
                           left_on='column_1',
                           right_on='column_2',
                           how='outer')

    # if data types are different in schema1 and schema2, move to error
    schema_error = schema[schema['type_1'] != schema['type_2']].reset_index(
        drop=True)
    schema_error['error'] = "inconsistent data types"
    schema_error.loc[schema_error['column_1'].isnull(),
                     'error'] = "column not in table1"
    schema_error.loc[schema_error['column_2'].isnull(),
                     'error'] = "column not in table2"
    schema_correct = schema[schema['type_1'] == schema['type_2']].reset_index(
        drop=True)

    corr_results = []

    def _comparable_features(dtype):
        # features declared with `dtype` in both schemas that also exist as
        # columns in both tables
        declared = schema_correct[schema_correct['type_1'] ==
                                  dtype]['column_1'].values
        return [
            feat for feat in declared
            if (feat in table1.columns.values) and (
                feat in table2.columns.values)
        ]

    def _collect_corr(results):
        # gather the per-feature 'corr' entries for the summary sheet
        for result in results:
            if 'corr' in result:
                corr_results.append(result['corr'])

    # for key features
    key_features = _comparable_features('key')
    if len(key_features) > 0:
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_key)(col, table1[[col]], table2[[col]], img_dir)
            for col in key_features)
        _collect_corr(key_results)

        # write all results to worksheet
        ws = wb.create_sheet(title='key')
        _insert_numeric_results(key_results, ws, 40, img_dir)

    # for numeric features
    numeric_features = _comparable_features('numeric')
    if len(numeric_features) > 0:
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col, table1[[_key1, col]], table2[
                [_key2, col]], _key1, _key2, img_dir)
            for col in numeric_features)
        _collect_corr(numeric_results)

        # write all results to worksheet
        ws = wb.create_sheet(title='numeric')
        _insert_numeric_results(numeric_results, ws, 45, img_dir)

    # for string features
    string_features = _comparable_features('str')
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_string)(col, table1[[_key1, col]], table2[
                [_key2, col]], _key1, _key2)
            for col in string_features)
        _collect_corr(string_results)

        # write all results to worksheet
        ws = wb.create_sheet(title='string')
        _insert_string_results(string_results, ws, 25)

    # for date features
    date_features = _comparable_features('date')
    if len(date_features) > 0:
        # dates are replaced by their distance to today's date so they can
        # go through the numeric comparison
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table1[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table1[col], errors='coerce')).astype('timedelta64[M]',
                                                      errors='ignore')
            table2[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table2[col], errors='coerce')).astype('timedelta64[M]',
                                                      errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col,
                                      table1[[_key1, col]],
                                      table2[[_key2, col]],
                                      _key1,
                                      _key2,
                                      img_dir,
                                      date_flag=True) for col in date_features)
        _collect_corr(date_results)

        # write all results to worksheet
        ws = wb.create_sheet(title='date')
        _insert_numeric_results(date_results, ws, 45, img_dir, date_flag=True)

    # insert the summary into the workbook's default sheet
    ws = wb['Sheet']
    ws.title = 'summary'
    summary_df = schema_correct[['column_1', 'type_1']].rename(columns={
        'column_1': 'column',
        'type_1': 'type'
    })
    if corr_results:
        corr_df = pd.DataFrame(corr_results)
    else:
        # nothing was comparable: keep the merge keys so every feature is
        # reported as 'error' instead of failing with a KeyError
        corr_df = pd.DataFrame(columns=['column', 'corr'])
    summary_df = summary_df.merge(corr_df, on='column', how='left')
    summary_df['corr'] = summary_df['corr'].fillna('error')
    summary_df['error_flg'] = summary_df['corr'].apply(lambda x: 1
                                                       if x == 'error' else 0)
    error_rows = summary_df[summary_df['error_flg'] == 1].index.values

    _ = _insert_df(summary_df[['column', 'type', 'corr']], ws, header=True)

    # +2: worksheet rows are 1-based and the first row holds the header
    for r_idx in error_rows:
        ws['C%d' % (r_idx + 2)].style = 'Bad'
    _adjust_ws(ws=ws, row_height=25)

    # if there are some errors
    if len(schema_error) > 0:
        ws = wb.create_sheet(title='error')
        _ = _insert_df(schema_error, ws, header=True)
        _adjust_ws(ws=ws, row_height=25)

    wb.save(filename=os.path.join(output_root, 'data_consist_%s.xlsx' %
                                  (fname)))
    if not keep_images:
        shutil.rmtree(img_dir)
Example #30
0
    def _tell(self, x, y, constraints=None, fit=True):
        """Perform the actual work of incorporating one or more new points.
        See `tell()` for the full description.

        This method exists to give access to the internals of adding points
        by side stepping all input validation and transformation.

        Parameters
        ----------
        x : list or list of lists
            A single point, or a batch of points, to record.
        y : scalar or list
            Objective value(s) matching `x`.
        constraints : optional
            Constraint value(s) matching `x`. When given they are recorded
            and a constraint model is fitted alongside the objective model.
            Not recorded when "ps" is part of `self.acq_func`.
        fit : bool, default=True
            Whether to refit the surrogate model(s) and compute the next
            point to evaluate.

        Returns
        -------
        The object built by `create_result` from the recorded points,
        models and constraints.

        Raises
        ------
        ValueError
            If `x` and `y` are not a compatible point/value combination.
        """

        if "ps" in self.acq_func:
            if is_2Dlistlike(x):
                self.Xi.extend(x)
                self.yi.extend(y)
                self._n_initial_points -= len(y)
            elif is_listlike(x):
                self.Xi.append(x)
                self.yi.append(y)
                self._n_initial_points -= 1
        # if y isn't a scalar it means we have been handed a batch of points
        elif is_listlike(y) and is_2Dlistlike(x):
            self.Xi.extend(x)
            self.yi.extend(y)
            if constraints is not None:
                self.constraints.extend(constraints)
            self._n_initial_points -= len(y)
        elif is_listlike(x):
            self.Xi.append(x)
            self.yi.append(y)
            if constraints is not None:
                self.constraints.append(constraints)
            self._n_initial_points -= 1
        else:
            raise ValueError("Type of arguments `x` (%s) and `y` (%s) "
                             "not compatible." % (type(x), type(y)))

        # optimizer learned something new - discard cache
        self.cache_ = {}

        # after being "told" n_initial_points we switch from sampling
        # random points to using a surrogate model
        if (fit and self._n_initial_points <= 0 and
                self.base_estimator_ is not None):
            transformed_bounds = np.array(self.space.transformed_bounds)
            est = clone(self.base_estimator_)
            # The constraint model only exists when constraints were passed
            # to this call; `None` marks its absence for the code below.
            est_c = None
            if constraints is not None:
                est_c = clone(self.constraint_estimator_)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                est.fit(self.space.transform(self.Xi), self.yi)

                if est_c is not None:
                    est_c.fit(self.space.transform(self.Xi), self.constraints)

            if hasattr(self, "next_xs_") and self.acq_func == "gp_hedge":
                self.gains_ -= est.predict(np.vstack(self.next_xs_))
            self.models.append(est)
            # Record the constraint model itself (the objective model `est`
            # used to be appended here by mistake).
            if est_c is not None:
                self.constraint_models.append(est_c)

            # even with BFGS as optimizer we want to sample a large number
            # of points and then pick the best ones as starting points
            X = self.space.transform(self.space.rvs(
                n_samples=self.n_points, random_state=self.rng))

            if self.solution_processor is not None:
                for i in range(len(X)):
                    processed = self.solution_processor(X[i])
                    X[i] = list(np.concatenate(processed))

            self.next_xs_ = []
            for cand_acq_func in self.cand_acq_funcs_:
                if self.constraint_estimator_ is not None:
                    # only observations with a non-negative constraint value
                    # are used to determine the incumbent `y_opt`
                    mask = np.array(self.constraints) >= 0
                    if np.any(mask):
                        y_opt = np.min(np.array(self.yi)[mask])
                        values = _gaussian_acquisition(
                            X=X, model=est, y_opt=y_opt,
                            acq_func=cand_acq_func,
                            acq_func_kwargs=self.acq_func_kwargs)
                    else:
                        values = np.ones(X.shape[0])
                else:
                    values = _gaussian_acquisition(
                        X=X, model=est, y_opt=np.min(self.yi),
                        acq_func=cand_acq_func,
                        acq_func_kwargs=self.acq_func_kwargs)

                # Weight the acquisition values by the normal CDF of
                # mean / std of the constraint model's prediction. Guarded on
                # `est_c` so a call without constraints no longer fails on an
                # undefined name.
                if est_c is not None:
                    (means, stds) = est_c.predict(X, return_std=True)
                    scaled = np.divide(means, stds)
                    constraint_values = norm.cdf(scaled)
                    values = np.multiply(values, constraint_values)
                # Find the minimum of the acquisition function by randomly
                # sampling points from the space
                if self.acq_optimizer == "sampling":
                    # Walk the candidates from best (lowest value) to worst
                    # and take the first one not already recorded in Xi; if
                    # all are recorded the last candidate tried is kept.
                    # (The ranking used to be computed but then ignored.)
                    order = np.argsort(values)
                    for idx in order:
                        next_x = X[idx]
                        if list(next_x) not in self.Xi:
                            break

                # Use BFGS to find the mimimum of the acquisition function, the
                # minimization starts from `n_restarts_optimizer` different
                # points and the best minimum is used

                elif self.acq_optimizer == "lbfgs":
                    x0 = X[np.argsort(values)[:self.n_restarts_optimizer]]

                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        results = Parallel(n_jobs=self.n_jobs)(
                            delayed(fmin_l_bfgs_b)(
                                gaussian_acquisition_1D, start,
                                args=(est, np.min(self.yi), cand_acq_func,
                                      self.acq_func_kwargs),
                                bounds=self.space.transformed_bounds,
                                approx_grad=False,
                                maxiter=20)
                            for start in x0)

                    cand_xs = np.array([r[0] for r in results])
                    cand_acqs = np.array([r[1] for r in results])
                    next_x = cand_xs[np.argmin(cand_acqs)]

                # lbfgs should handle this but just in case there are
                # precision errors.
                if not self.space.is_categorical:
                    next_x = np.clip(
                        next_x, transformed_bounds[:, 0],
                        transformed_bounds[:, 1])
                self.next_xs_.append(next_x)

            if self.acq_func == "gp_hedge":
                # pick one of the candidate acquisition functions at random,
                # with softmax probabilities over the accumulated gains
                logits = np.array(self.gains_)
                logits -= np.max(logits)
                exp_logits = np.exp(self.eta * logits)
                probs = exp_logits / np.sum(exp_logits)
                next_x = self.next_xs_[np.argmax(self.rng.multinomial(1,
                                                                      probs))]
            else:
                next_x = self.next_xs_[0]

            # note the need for [0] at the end
            self._next_x = self.space.inverse_transform(
                next_x.reshape((1, -1)))[0]

        # Pack results
        return create_result(self.Xi, self.yi, self.space, self.rng,
                             models=self.models, constraints=self.constraints)
def run_perm_analysis(save_folder,
                      domains='all',
                      n_jobs=10,
                      use_summary=False,
                      type_of_analysis='any_anxiety',
                      n_perm=1000,
                      seed=None,
                      n_jobs_rf=2,
                      cat_encoding=None):
    """Compute reference and permuted feature importances and save them.

    The classifier returned by ``get_classifier`` is fitted once on the
    encoded data to obtain the reference ``feature_importances_``. Then, for
    each feature column, ``permute_feature`` is called ``n_perm`` times in
    parallel (each call on a fresh clone of the estimator) and the returned
    values are stored in the row of that feature.

    Parameters
    ----------
    save_folder : str
        Directory in which the two output files are written.
    domains : optional
        Passed to ``get_data`` as ``modality_name``; also inserted in the
        output file names.
    n_jobs : int
        Number of joblib workers used for the permutation runs.
    use_summary : bool
        Forwarded to ``get_data``.
    type_of_analysis : str
        Forwarded to ``create_labels``.
    n_perm : int
        Number of ``permute_feature`` runs per feature.
    seed : int or None
        ``random_state`` handed to ``get_classifier``. When None,
        ``int(time())`` is used. The value is saved next to the results.
    n_jobs_rf : int
        Forwarded to ``get_classifier``.
    cat_encoding : optional
        Forwarded to ``categorical_encoding`` as ``method``.

    Output files (inside ``save_folder``)
    -------------------------------------
    ``permuted_variable_importances_domains_<domains>.csv``
        One row per variable; columns ``true_feature_importances`` and
        ``perm_1`` ... ``perm_<n_perm>``.
    ``permuted_variable_importances_domains_<domains>_seed.npy``
        One-element array holding the seed.
    """
    if seed is None:
        seed = int(time())

    # Load raw data and labels.
    df, df_dtype, y = get_data(
        modality_name=domains,
        load_df=NESDA_FILE_MISSING,
        load_df_dtypes=NESDA_FILE_MISSING_DTYPE,
        load_df_summary=NESDA_FILE_MISSING_SUMMARY,
        load_df_dtypes_summary=NESDA_FILE_MISSING_SUMMARY_DTYPE,
        load_df_labels=NESDA_FILE_LABELS,
        use_summary=use_summary,
        target_col=['persistance_anxiety', 'pureanxiety'])
    y, multiclass = create_labels(y, type_of_analysis)

    # Impute, then encode categorical variables using every row.
    df, cat_vars = impute_data(df, df_dtype)
    all_rows = np.arange(df.shape[0])
    X, var_names = categorical_encoding(df, y, cat_vars, all_rows,
                                        method=cat_encoding)

    # Reference fit on the unpermuted data.
    n_subj, n_features = X.shape
    estimator = get_classifier(n_subj,
                               random_state=seed,
                               n_jobs_rf=n_jobs_rf,
                               multiclass=multiclass)
    estimator.fit(X, y)
    feat_imp_true = estimator.feature_importances_

    perm_col = ['perm_{}'.format(k) for k in range(1, n_perm + 1)]
    df_feat_imp = pd.DataFrame(
        index=var_names, columns=['true_feature_importances'] + perm_col)
    df_feat_imp['true_feature_importances'] = feat_imp_true

    for i_feature in range(n_features):
        feature_name = var_names[i_feature]
        print('{}/{}; Feature: {}'.format(i_feature + 1, n_features,
                                          feature_name))
        # A fresh copy per feature is handed to all runs of that feature.
        X_perm = X.copy()
        runner = Parallel(n_jobs=n_jobs,
                          verbose=1,
                          pre_dispatch='2*n_jobs',
                          max_nbytes='50M')
        tasks = (delayed(permute_feature)(clone(estimator), X_perm, y,
                                          i_feature)
                 for _ in range(n_perm))
        df_feat_imp.loc[feature_name, perm_col] = runner(tasks)

    csv_path = osp.join(
        save_folder,
        'permuted_variable_importances_domains_{}.csv'.format(domains))
    df_feat_imp.to_csv(csv_path)

    seed_path = osp.join(
        save_folder,
        'permuted_variable_importances_domains_{}_seed.npy'.format(domains))
    np.save(seed_path, np.array([seed]))
Example #32
0
    def fit(self, niimgs=None, y=None, confounds=None):
        """Compute the mask and the components

        Parameters
        ----------
        niimgs: list of filenames or NiImages
            Data on which the PCA must be calculated. If this is a list,
            the affine is considered the same for all.

        y: ignored
            Not used by this method.

        confounds: optional
            Forwarded unchanged to ``session_pca`` for every subject.

        Returns
        -------
        self
            With ``masker_``, ``mask_img_`` and ``components_`` set.

        Raises
        ------
        ValueError
            When several subjects are given and ``n_components`` exceeds the
            number of rows of a subject-level PCA result.
        """
        # Hack to support single-subject data:
        if isinstance(niimgs, (basestring, nibabel.Nifti1Image)):
            niimgs = [niimgs]
            # This is a very incomplete hack, as it won't work right for
            # single-subject list of 3D filenames
        # First, learn the mask
        if not isinstance(self.mask, MultiNiftiMasker):
            self.masker_ = MultiNiftiMasker(mask=self.mask,
                                            smoothing_fwhm=self.smoothing_fwhm,
                                            target_affine=self.target_affine,
                                            target_shape=self.target_shape,
                                            low_pass=self.low_pass,
                                            high_pass=self.high_pass,
                                            t_r=self.t_r,
                                            memory=self.memory,
                                            memory_level=self.memory_level)
        else:
            try:
                self.masker_ = clone(self.mask)
            except TypeError:
                # Workaround for a joblib bug: in joblib 0.6, a Memory object
                # with cachedir = None cannot be cloned.
                masker_memory = self.mask.memory
                if masker_memory.cachedir is None:
                    # Temporarily detach the memory so clone() succeeds, then
                    # restore it on the user's masker.
                    self.mask.memory = None
                    self.masker_ = clone(self.mask)
                    self.mask.memory = masker_memory
                    self.masker_.memory = Memory(cachedir=None)
                else:
                    # The error was raised for another reason: propagate it
                    # with its original traceback.
                    raise

            # The estimator's own settings take precedence over whatever was
            # set on the user-supplied masker.
            for param_name in [
                    'target_affine', 'target_shape', 'smoothing_fwhm',
                    'low_pass', 'high_pass', 't_r', 'memory', 'memory_level'
            ]:
                if getattr(self.masker_, param_name) is not None:
                    warnings.warn('Parameter %s of the masker overriden' %
                                  param_name)
                setattr(self.masker_, param_name, getattr(self, param_name))
        if self.masker_.mask is None:
            self.masker_.fit(niimgs)
        else:
            self.masker_.fit()
        self.mask_img_ = self.masker_.mask_img_

        parameters = get_params(MultiNiftiMasker, self)
        parameters['detrend'] = True
        parameters['standardize'] = True

        # Now do the subject-level signal extraction (i.e. data-loading +
        # PCA)

        subject_pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(session_pca)(niimg,
                                 self.masker_.mask_img_,
                                 parameters,
                                 n_components=self.n_components,
                                 memory=self.memory,
                                 ref_memory_level=self.memory_level,
                                 confounds=confounds,
                                 verbose=self.verbose) for niimg in niimgs)
        # Each job result is unpacked as a (components, singular values) pair.
        subject_pcas, subject_svd_vals = zip(*subject_pcas)

        if len(niimgs) > 1:
            if not self.do_cca:
                # Scale each subject's components in place by its singular
                # values.
                for subject_pca, subject_svd_val in \
                        zip(subject_pcas, subject_svd_vals):
                    subject_pca *= subject_svd_val[:, np.newaxis]
            # Stack the per-subject components row-wise, n_components rows
            # per subject.
            data = np.empty(
                (len(niimgs) * self.n_components, subject_pcas[0].shape[1]),
                dtype=subject_pcas[0].dtype)
            for index, subject_pca in enumerate(subject_pcas):
                if self.n_components > subject_pca.shape[0]:
                    raise ValueError('You asked for %i components. '
                                     'This is larger than the single-subject '
                                     'data size (%i).' %
                                     (self.n_components,
                                      subject_pca.shape[0]))
                data[index * self.n_components:(index + 1) *
                     self.n_components] = subject_pca
            # Group-level reduction of the stacked components.
            data, variance, _ = randomized_svd(data.T,
                                               n_components=self.n_components)
            data = data.T
        else:
            data = subject_pcas[0]
        self.components_ = data
        return self
Example #33
0
def _fit(self, X, y, parameter_iterable):
    """Actual fitting,  performing the search over parameters.

    Every parameter setting yielded by ``parameter_iterable`` is evaluated on
    every split of ``self.cv`` with ``_fit_and_score``. The per-setting mean
    scores are stored in ``self.grid_scores_`` and the best setting in
    ``self.best_params_`` / ``self.best_score_``.

    Parameters
    ----------
    X : indexable
        Training data.
    y : indexable or None
        Targets; when given, must have as many entries as ``X`` has samples.
    parameter_iterable : iterable of dict
        Candidate parameter settings.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``y`` is given and its length differs from the number of samples.
    Warning
        If ``self.refit`` is false (raised after the scores are stored).
    """
    # ``Sized`` lives in ``collections.abc``; the ``collections`` alias was
    # removed in Python 3.10. Fall back for Python 2.
    try:
        from collections.abc import Sized
    except ImportError:
        from collections import Sized

    estimator = self.estimator
    # Shared container handed to every _fit_and_score call (see below).
    foldsForEstimator = {}
    cv = self.cv

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))

    # Splits the data based on provided cross-validation splitting strategy.
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            # Adjacent literals (not a backslash inside the string) so that
            # no source indentation leaks into the message.
            print("Fitting {0} folds for each of {1} candidates, totalling "
                  "{2} fits".format(len(cv), n_candidates,
                                    n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    # Change from original scikit code: adding a new argument,
    # foldsForEstimator, to the _fit_and_score function to track metadata
    # for each estimator, for each fold.
    # _fit_and_score fits the estimator and computes the score for a given
    # data-split, for given parameters.
    out = Parallel(n_jobs=self.n_jobs,
                   verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
                       delayed(_fit_and_score)(clone(base_estimator),
                                               X,
                                               y,
                                               self.scorer_,
                                               train,
                                               test,
                                               self.verbose,
                                               parameters,
                                               self.fit_params,
                                               foldsForEstimator,
                                               return_parameters=True,
                                               error_score=self.error_score)
                       for parameters in parameter_iterable
                       for train, test in cv)

    # Each entry of out is unpacked below as a 4-tuple:
    # (score, n_test_samples, <ignored>, parameters). Entries are ordered
    # candidate by candidate, n_folds consecutive entries per candidate.
    n_fits = len(out)
    n_folds = len(cv)

    # Computes the scores for each of the folds, for all the possible
    # parameters, and stores them in grid_scores.
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in out[
                grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                # Weight each fold by its number of test samples.
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            CVScoreTuple(parameters, score, np.array(all_scores)))

    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    else:
        # If refit is false, best_estimator_ is unavailable and further
        # predictions can't be made on this instance. This is raised as an
        # exception, so grid_cv_event below is never populated in that case.
        raise Warning(
            "Note: Refit has been set to false, which makes it impossible to "
            "make predictions using this GridSearchCV instance after fitting. "
            "Change refit to true to enable this")

    # Change from original scikit code:
    # Populate new field with necessary attributes for storing
    # cross-validation event
    self.grid_cv_event = [
        X, foldsForEstimator, 0,
        type_of_target(y), self.best_estimator_, self.best_estimator_, n_folds
    ]
    return self
Example #34
0
 def fit(self, X, y):
     """Run ``_avgest_fit_est`` on every estimator in parallel.

     Each entry of ``self.estimators`` is passed, together with its
     position, ``X``, ``y`` and ``self.verbose``, to ``_avgest_fit_est``.
     ``self.estimators`` is then replaced by the collected results.

     Returns
     -------
     self
     """
     pool = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
     jobs = (delayed(_avgest_fit_est)(member, position, X, y, self.verbose)
             for position, member in enumerate(self.estimators))
     self.estimators = pool(jobs)
     return self
Example #35
0
	# Cross-validate `classifier` (defined outside this fragment) over
	# product folds and report the spread of the per-fold scores.
	feature_list = ['user_attributes'] 

	# trainfeatures: per the original author's note, a feature list of user
	# attributes, where each user attribute has a list of relevant products.
	# trainlabels: unpickled from <trainPath>labels.train; per the original
	# author's note, a list of products, each holding a list of 0/1-encoded
	# user attributes.
	trainfeatures = loadFeatures(feature_list, mode='train')
	print len(trainfeatures)
	#print trainfeatures
	# NOTE(review): the file handle opened here is never explicitly closed.
	trainlabels = cPickle.load(open(trainPath + 'labels.train', 'rb'))
	print len(trainlabels)
	print "Loaded train features and train labels" 
        
	# 5 shuffled folds over len(trainlabels) items; random_state is fixed so
	# the split is the same on every run.
	cv_products = KFold(len(trainlabels), n_folds=5, indices=True, shuffle=True, random_state=1)
	print "Set up KFold."
    
	# One crossValidation call per fold, spread over all cores (n_jobs=-1).
	score = Parallel(n_jobs=-1)(delayed(crossValidation)(trainlabels, trainfeatures, classifier, train_products, test_products, pairwise=False) for train_products, test_products in cv_products)

	# Summarise the per-fold results: mean, std and mean minus std.
	score = np.array(score)
	print 'score mean, std, mean-std:', score.mean(), score.std(), score.mean() - score.std()











 X_train, X_test, y_train, y_test = cross_validation.train_test_split(im_features, np.array(image_classes), test_size=0.26, random_state=0)
 clf = SVC(C = 100, kernel='rbf').fit(X_train, y_train)
 scores = clf.score(X_test, y_test) 
 print scores
 """
 
 # Grid-search SVC hyper-parameters in parallel, refit the best setting on
 # the training split and report its performance on the held-out split.
 # Hold out 5% of the samples; random_state is fixed for a repeatable split.
 X_train, X_test, y_train, y_test = cross_validation.train_test_split(im_features, np.array(image_classes), test_size=0.05, random_state=0)
 # gamma: 10**-5 ... 10**4.5 in half-decade steps (20 values).
 gamma_range = np.power (10., np.arange (-5, 5, 0.5));
 # C: 10**-5 ... 10**4 in decade steps (10 values).
 C_range = np.power (10., np.arange (-5, 5));
 # Two sub-grids: RBF kernel over (C, gamma) and linear kernel over C only.
 grid_search_params = \
   [{'kernel' : ['rbf'], 'C' : C_range, 'gamma' : gamma_range},\
    {'kernel' : ['linear'], 'C' : C_range}];
 
 # The class itself (not an instance) is passed around and instantiated
 # with each parameter setting.
 classifier = svm.SVC
 
 # One run_gridSearch call per parameter setting, on all cores.
 # NOTE(review): run_gridSearch is defined elsewhere; it receives the test
 # split, so if it scores on X_test the final report below is computed on
 # the same data used for model selection -- confirm.
 grid_search_ans = Parallel(n_jobs = -1)(delayed(run_gridSearch)(classifier, args, X_train, y_train, X_test, y_test) for args in list(grid_search.ParameterGrid(grid_search_params)))
 
 # Results come back in grid order, so the index of the maximum identifies
 # the winning setting (first one on ties).
 best_params = list(grid_search.ParameterGrid(grid_search_params))[grid_search_ans.index(max(grid_search_ans))]
 
 # Refit the winning setting on the training split.
 clf = classifier(**best_params).fit(X_train, y_train)
 
 pred = clf.predict(X_test)
 
 # NOTE(review): scikit-learn documents classification_report(y_true,
 # y_pred); here predictions are passed first, which would swap precision
 # and recall in the report -- confirm and reorder if unintended.
 print metrics.classification_report (pred, y_test)
 
 # Accuracy is symmetric in its two arguments, so the order does not
 # affect this number.
 print 'accuracy: ', metrics.accuracy_score(pred, y_test)
 
 
 # Save the SVM
 #joblib.dump((clf, training_names, stdSlr, k, voc), "surf_fm_trained.pkl", compress=3)