Example 1
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
       Continuous vector of measurements corresponding to samples.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not weighted:
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
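A minimal call sketch, assuming gradient_clustering as defined above (with its gneiss helpers importable); the table, sample IDs, and depth values below are invented:

import pandas as pd
from qiime2 import NumericMetadataColumn

table = pd.DataFrame({'f1': [4, 0, 1], 'f2': [0, 2, 3]},
                     index=['s1', 's2', 's3'])
gradient = NumericMetadataColumn(
    pd.Series([1.0, 3.0, 5.0],
              index=pd.Index(['s1', 's2', 's3'], name='id'),
              name='depth'))
tree = gradient_clustering(table, gradient, weighted=False)
print(tree.ascii_art())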
Example 2
def distance_matrix(metadata: qiime2.NumericMetadataColumn)\
        -> skbio.DistanceMatrix:
    if metadata.has_missing_values():
        missing = metadata.get_ids(where_values_missing=True)
        raise ValueError(
            "Encountered missing value(s) in the metadata column. Computing "
            "a distance matrix from missing values is not supported. IDs with "
            "missing values: %s" % ', '.join(sorted(missing)))

    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    series = metadata.to_series()
    distances = scipy.spatial.distance.pdist(series.values[:, np.newaxis],
                                             metric='euclidean')
    return skbio.DistanceMatrix(distances, ids=series.index)
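Since the distances are plain Euclidean distances between scalar values, a small column makes the output easy to check by hand. A hedged sketch using the function above (IDs and pH values invented):

import pandas as pd
import qiime2

ph = qiime2.NumericMetadataColumn(
    pd.Series([6.8, 7.1, 7.4],
              index=pd.Index(['s1', 's2', 's3'], name='id'),
              name='ph'))
dm = distance_matrix(ph)
print(dm['s1', 's3'])  # ~0.6, i.e. |6.8 - 7.4|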
Example 3
def autocorr(output_dir: str,
             distance_matrix: DistanceMatrix,
             metadata: qiime2.NumericMetadataColumn,
             permutations: int = 999,
             two_tailed: bool = True,
             transformation: str = 'R',
             intersect_ids: bool = False) -> None:
    # match ids — metadata can be superset
    metadata = metadata.to_series()
    metadata, distance_matrix = match_ids(metadata,
                                          distance_matrix,
                                          intersect_ids=intersect_ids)

    # compute Moran's I and Geary's C
    results, weights = autocorr_from_dm(metadata,
                                        distance_matrix,
                                        permutations=permutations,
                                        two_tailed=two_tailed,
                                        transformation=transformation)

    mplot = moran_plot(metadata, weights, transformation)

    # Visualize
    save_map(mplot, output_dir)
    mapviz(output_dir, results=results, title='Autocorrelation statistics')
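A hedged call sketch for autocorr; the helpers (match_ids, autocorr_from_dm, moran_plot, save_map, mapviz) are assumed to come from the surrounding plugin module, the distance matrix and measurements are invented, and output_dir is assumed to already exist, as QIIME 2 arranges for visualizers:

import pandas as pd
import skbio
import qiime2

ids = ['s1', 's2', 's3', 's4']
dm = skbio.DistanceMatrix([[0, 1, 2, 3],
                           [1, 0, 1, 2],
                           [2, 1, 0, 1],
                           [3, 2, 1, 0]], ids=ids)
md = qiime2.NumericMetadataColumn(
    pd.Series([0.1, 0.4, 0.5, 0.9],
              index=pd.Index(ids, name='id'),
              name='measurement'))
autocorr('autocorr-results', dm, md, permutations=99)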
Example 4
def regress_samples(output_dir: str,
                    table: pd.DataFrame,
                    metadata: qiime2.NumericMetadataColumn,
                    test_size: float = defaults['test_size'],
                    step: float = defaults['step'],
                    cv: int = defaults['cv'],
                    random_state: int = None,
                    n_jobs: int = defaults['n_jobs'],
                    n_estimators: int = defaults['n_estimators'],
                    estimator: str = 'RandomForestRegressor',
                    optimize_feature_selection: bool = False,
                    stratify: bool = False,
                    parameter_tuning: bool = False) -> None:

    # extract column name from NumericMetadataColumn
    column = metadata.to_series().name

    # disable feature selection for unsupported estimators
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # specify parameters and distributions to sample from for parameter tuning
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator,
        table,
        metadata,
        column,
        n_estimators,
        n_jobs,
        cv,
        random_state,
        parameter_tuning,
        classification=True)

    estimator, cm, accuracy, importances = split_optimize_classify(
        table,
        metadata,
        column,
        estimator,
        output_dir,
        test_size=test_size,
        step=step,
        cv=cv,
        random_state=random_state,
        n_jobs=n_jobs,
        optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning,
        param_dist=param_dist,
        calc_feature_importance=calc_feature_importance,
        scoring=mean_squared_error,
        stratify=stratify,
        classification=False)

    _visualize(output_dir,
               estimator,
               cm,
               accuracy,
               importances,
               optimize_feature_selection,
               title='regression predictions')
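A hedged call sketch; the random table and target below are invented, and the defaults dict plus the private helpers (_disable_feature_selection, _set_parameters_and_estimator, split_optimize_classify, _visualize) are assumed to come from the surrounding q2-sample-classifier module:

import numpy as np
import pandas as pd
import qiime2

rng = np.random.RandomState(0)
samples = ['s%d' % i for i in range(30)]
table = pd.DataFrame(rng.poisson(5, size=(30, 10)).astype(float),
                     index=samples,
                     columns=['f%d' % i for i in range(10)])
target = qiime2.NumericMetadataColumn(
    pd.Series(rng.normal(size=30),
              index=pd.Index(samples, name='id'),
              name='target'))
regress_samples('regress-results', table, target, cv=3)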
Example 5
    def setUp(self):
        self.results = "results"
        if not os.path.exists(self.results):
            os.mkdir(self.results)
        self.balances = pd.DataFrame(
            {
                'a': [-2, -1, 0, 1, 2],
                'b': [-2, 0, 0, 0, 0]
            },
            index=['a1', 'a2', 'a3', 'a4', 'a5'])
        self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])

        self.taxonomy = pd.DataFrame(
            [['foo;barf;a;b;c;d;e', 1], ['foo;bark;f;g;h;i;j', 1],
             ['foo;bark;f;g;h;w;j', 1], ['nom;tu;k;l;m;n;o', 0.9],
             ['nom;tu;k;l;m;t;o', 0.9]],
            columns=['Taxon', 'Confidence'],
            index=['x', 'y', 'z', 'k', 'q'])

        self.balances = pd.DataFrame(
            [[1, 2, 3, 4, 5, 6, 7], [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
             [1, 1, 1, 1, 1, 1, 1], [3, 2, 1, 0, -1, -2, -3]],
            index=['d', 'a', 'b', 'c'],
            columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T
        basis, _ = balance_basis(self.tree)
        self.table = pd.DataFrame(
            ilr_inv(self.balances, basis),
            columns=['x', 'y', 'z', 'k', 'q'],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'])

        index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'], name='id')
        self.categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                      index=index,
                      name='categorical'))
        self.multi_categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                      index=index,
                      name='multi_categorical'))
        self.partial_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                      index=index,
                      name='multi_categorical'))
        self.full_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                      index=index,
                      name='numerical_categorical'))
        self.continuous = NumericMetadataColumn(
            pd.Series(np.arange(7), index=index, name='continuous'))
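The fixture boils down to one reusable pattern: a qiime2 metadata column wraps a named pandas Series whose index holds the sample IDs (here the index is named 'id'). In isolation:

import pandas as pd
from qiime2 import NumericMetadataColumn

index = pd.Index(['s1', 's2', 's3'], name='id')
continuous = NumericMetadataColumn(
    pd.Series([0.5, 1.5, 2.5], index=index, name='continuous'))
print(continuous.to_series())  # recovers the same named Series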
Example 6
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        ignore_missing_samples: bool = False,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
       Continuous vector of measurements corresponding to samples.
    ignore_missing_samples : bool
       Whether to raise an error or silently ignore samples that are present
       in the table but not in the gradient metadata.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not ignore_missing_samples:
        difference = set(table.index) - set(c.index)
        if difference:
            raise KeyError("There are samples present in the table not "
                           "present in the gradient metadata column. Override "
                           "this error by using the `ignore_missing_samples` "
                           "argument. Offending samples: %r" %
                           ', '.join(sorted([str(i) for i in difference])))
    if not weighted:
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
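The new guard can be exercised directly; a hedged sketch (same invented table as the Example 1 sketch) in which the gradient is missing sample 's3':

import pandas as pd
from qiime2 import NumericMetadataColumn

table = pd.DataFrame({'f1': [4, 0, 1], 'f2': [0, 2, 3]},
                     index=['s1', 's2', 's3'])
gradient = NumericMetadataColumn(
    pd.Series([1.0, 3.0],
              index=pd.Index(['s1', 's2'], name='id'),
              name='depth'))
# gradient_clustering(table, gradient)  # raises KeyError for 's3'
tree = gradient_clustering(table, gradient, ignore_missing_samples=True)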
Example 7
def regress(
        features: np.ndarray,
        y: qiime2.NumericMetadataColumn,
        c: np.ndarray = None,
        # PATH parameters:
        path: bool = True,
        path_numerical_method: str = 'not specified',
        path_n_active: int = 0,
        path_lambdas: list = None,
        path_nlam_log: int = 40,
        path_lamin_log: float = 1e-2,

        # CV parameters:
        cv: bool = True,
        cv_numerical_method: str = 'not specified',
        cv_seed: int = 1,
        cv_lambdas: list = None,  # to do 
        cv_one_se: bool = True,
        cv_subsets: int = 5,

        # StabSel parameters:
        stabsel: bool = True,
        stabsel_numerical_method: str = 'not specified',
        stabsel_seed: int = None,  # do something here ! for now it can be a bool !
        stabsel_lam: float = -1.0,  # if negative, then it means 'theoretical'
        stabsel_true_lam: bool = True,
        stabsel_method: str = 'first',
        stabsel_b: int = 50,
        stabsel_q: int = 10,
        stabsel_percent_ns: float = 0.5,
        stabsel_lamin: float = 1e-2,
        stabsel_threshold: float = 0.7,
        stabsel_threshold_label: float = 0.4,  # might be unneeded here, but needed for visualisation

        # LAMfixed parameters:
        lamfixed: bool = True,
        lamfixed_numerical_method: str = 'not specified',
        lamfixed_lam: float = -1.0,  # if negative, then it means 'theoretical'
        lamfixed_true_lam: bool = True,

        # Formulation parameters
        concomitant: bool = True,
        huber: bool = False,
        rho: float = 1.345,
        rescale: bool = False) -> classo_problem:

    y = y.to_series().to_numpy()

    problem = classo_problem(features, y, C=c, rescale=rescale)
    problem.formulation.huber = huber
    problem.formulation.concomitant = concomitant
    problem.formulation.rho = rho

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        if path_lambdas is None:
            param.lambdas = np.array([
                10**(np.log10(path_lamin_log) * float(i) / path_nlam_log)
                for i in range(0, path_nlam_log)
            ])
        else:
            param.lambdas = path_lambdas

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        if cv_lambdas is None:
            param.lambdas = np.linspace(1., 1e-3, 500)
        else:
            param.lambdas = cv_lambdas

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        if stabsel_lam > 0.:
            param.lam = stabsel_lam
        else:
            param.lam = 'theoretical'

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        if lamfixed_lam > 0.:
            param.lam = lamfixed_lam
        else:
            param.lam = 'theoretical'

    problem.solve()

    return problem
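A hedged end-to-end sketch; classo_problem comes from the c-lasso package, the synthetic design below is invented, and only stability selection is left switched on so the run stays small:

import numpy as np
import pandas as pd
import qiime2

rng = np.random.RandomState(1)
X = rng.normal(size=(40, 6))
index = pd.Index(['s%d' % i for i in range(40)], name='id')
y = qiime2.NumericMetadataColumn(
    pd.Series(X[:, 0] - X[:, 1] + 0.1 * rng.normal(size=40),
              index=index, name='y'))
problem = regress(X, y, path=False, cv=False, lamfixed=False)
print(problem.solution)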
Example 8
def regress(
    features: pd.DataFrame,
    y: qiime2.NumericMetadataColumn,
    c: np.ndarray = None,
    weights: np.ndarray = None,
    do_yshift: bool = False,
    # taxa: skbio.TreeNode = None,
    # PATH parameters:
    path: bool = True,
    path_numerical_method: str = "not specified",
    path_n_active: int = 0,
    path_nlam_log: int = 40,
    path_lamin_log: float = 1e-2,
    # CV parameters:
    cv: bool = True,
    cv_numerical_method: str = "not specified",
    cv_seed: int = 1,
    cv_one_se: bool = True,
    cv_subsets: int = 5,
    cv_nlam: int = 100,
    cv_lamin: float = 1e-3,
    cv_logscale: bool = True,
    # StabSel parameters:
    stabsel: bool = True,
    stabsel_numerical_method: str = "not specified",
    stabsel_seed: int = None,  # do something here ! for now it can be a bool !
    stabsel_lam: float = -1.0,  # if negative, then it means 'theoretical'
    stabsel_true_lam: bool = True,
    stabsel_method: str = "first",
    stabsel_b: int = 50,
    stabsel_q: int = 10,
    stabsel_percent_ns: float = 0.5,
    stabsel_lamin: float = 1e-2,
    stabsel_threshold: float = 0.7,
    stabsel_threshold_label: float = 0.4,
    # (threshold_label might be unneeded here, but it is needed for visualisation)
    # LAMfixed parameters:
    lamfixed: bool = True,
    lamfixed_numerical_method: str = "not specified",
    lamfixed_lam: float = -1.0,  # if negative, then it means 'theoretical'
    lamfixed_true_lam: bool = True,
    # Formulation parameters
    concomitant: bool = True,
    huber: bool = False,
    rho: float = 1.345,
    intercept: bool = True,
) -> classo_problem:

    complete_y = y.to_series()
    complete_y = complete_y[~complete_y.isna()]

    features, pdY = features.align(y.to_series(), join="inner", axis=0)
    missing = pdY.isna()
    training_labels = list(pdY[~missing].index)
    label_missing = list(pdY.index[missing])
    if label_missing:
        print("{} are missing in y ".format(label_missing))
    Y = pdY[~missing].to_numpy()
    X = features.values[~missing, :]

    print(Y.shape, X.shape)

    if do_yshift:
        Y = Y - np.mean(Y)

    problem = classo_problem(X, Y, C=c, label=list(features.columns))
    problem.formulation.huber = huber
    problem.formulation.concomitant = concomitant
    problem.formulation.rho = rho
    problem.formulation.intercept = intercept
    d = X.shape[1]
    if weights is not None:
        if len(weights) < d:
            problem.formulation.w = np.concatenate(
                [weights, np.ones(d - len(weights))], axis=0)
        else:
            problem.formulation.w = weights[:d]

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        param.logscale = True
        param.Nlam = path_nlam_log
        param.lamin = path_lamin_log

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        param.lamin = cv_lamin
        param.Nlam = cv_nlam
        param.logscale = cv_logscale

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        if stabsel_lam > 0.0:
            param.lam = stabsel_lam
        else:
            param.lam = "theoretical"

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        if lamfixed_lam > 0.0:
            param.lam = lamfixed_lam
        else:
            param.lam = "theoretical"

    print("start solve !")
    problem.solve()
    print("finished solve ! ")

    problem.data.complete_y = complete_y.values
    problem.data.complete_labels = list(complete_y.index)
    problem.data.training_labels = training_labels

    return problem
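One detail worth isolating is the weights handling above: a vector shorter than the feature count d is padded with ones, and a longer one is truncated. In isolation:

import numpy as np

weights, d = np.array([2.0, 2.0]), 5
padded = np.concatenate([weights, np.ones(d - len(weights))], axis=0)
print(padded)  # [2. 2. 1. 1. 1.]
print(np.array([1., 2., 3., 4., 5., 6.])[:d])  # the truncation case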
Example 9
def gtr_single_partition(alignment: qiime2.Metadata,
                         time: qiime2.NumericMetadataColumn,
                         n_generations: int,
                         sample_every: int,
                         time_uncertainty: qiime2.NumericMetadataColumn = None,
                         base_freq: str = "estimated",
                         site_gamma: int = 4,
                         site_invariant: bool = True,
                         clock: str = 'ucln',
                         coalescent_model: str = 'skygrid',
                         skygrid_intervals: int = None,
                         skygrid_duration: float = None,
                         print_every: int = None,
                         use_gpu: bool = False,
                         n_threads: int = 1) -> BEASTPosteriorDirFmt:

    if coalescent_model == 'skygrid':
        if skygrid_duration is None or skygrid_intervals is None:
            raise ValueError("skygrid not parameterized (TODO: better error)")

    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError("use_gpu requires n_threads == 1")
        beast_call += ['-beagle_GPU', '-beagle_cuda', '-beagle_instances', '1']
    else:
        beast_call += [
            '-beagle_CPU', '-beagle_SSE', '-beagle_instances',
            str(n_threads)
        ]

    # Set up directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())

    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into control file
    seq_series = alignment.get_column('Sequence').to_series()
    time_series = time.to_series()

    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        uncertainty_series = time_series.copy()
        uncertainty_series[...] = None

    samples_df = pd.concat([seq_series, time_series, uncertainty_series],
                           axis='columns',
                           join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq', 'time', 'time_uncertainty']
    samples_df = samples_df.replace({np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate control file for BEAST
    template_kwargs = dict(trees_file=trees_file,
                           ops_file=ops_file,
                           log_file=log_file,
                           sample_every=sample_every,
                           print_every=print_every,
                           n_generations=n_generations,
                           time_unit='years',
                           samples=samples,
                           base_freq=base_freq,
                           site_gamma=site_gamma,
                           site_invariant=site_invariant,
                           clock=clock,
                           coalescent_model=coalescent_model,
                           skygrid_duration=skygrid_duration,
                           skygrid_intervals=skygrid_intervals)

    template = _get_template("gtr_single_partition.xml")
    template.stream(**template_kwargs).dump(control_file)

    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result
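The ID matching in this wrapper (and the next) is a plain inner join over the metadata series, so only IDs present in every column survive. In isolation:

import pandas as pd

seq = pd.Series({'s1': 'ACGT', 's2': 'ACGA'}, name='seq')
time = pd.Series({'s2': 2001.5, 's3': 2002.0}, name='time')
merged = pd.concat([seq, time], axis='columns', join='inner')
print(merged.index.tolist())  # ['s2']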
Example 10
def site_heterogeneous_hky(
        coding_regions: qiime2.Metadata,
        noncoding_regions: qiime2.Metadata,
        time: qiime2.NumericMetadataColumn,
        n_generations: int,
        sample_every: int,
        print_every: int = None,
        time_uncertainty: qiime2.NumericMetadataColumn = None,
        use_gpu: bool = False,
        n_threads: int = 1) -> BEASTPosteriorDirFmt:

    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError("use_gpu requires n_threads == 1")
        beast_call += ['-beagle_GPU', '-beagle_cuda', '-beagle_instances', '1']
    else:
        beast_call += [
            '-beagle_CPU', '-beagle_SSE', '-beagle_instances',
            str(n_threads)
        ]

    # Set up directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())

    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into control file
    orf_series = coding_regions.get_column('Sequence').to_series()
    nc_series = noncoding_regions.get_column('Sequence').to_series()
    time_series = time.to_series()
    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        uncertainty_series = time_series.copy()
        uncertainty_series[...] = None

    samples_df = pd.concat(
        [orf_series, nc_series, time_series, uncertainty_series],
        axis='columns',
        join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq_orf', 'seq_nc', 'time', 'time_uncertainty']
    samples_df = samples_df.replace({np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate control file for BEAST
    template_kwargs = dict(trees_file=trees_file,
                           ops_file=ops_file,
                           log_file=log_file,
                           sample_every=sample_every,
                           print_every=print_every,
                           n_generations=n_generations,
                           time_unit='years',
                           samples=samples)
    template = _get_template("orf_and_nc.xml")
    template.stream(**template_kwargs).dump(control_file)

    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result