import pdb

import numpy as np
import pandas as pd

from mofapy2.run.entry_point import entry_point


def test_mofa():
    D = [1000, 1000]  # Number of features per view
    M = len(D)  # Number of views
    K = 5  # Number of factors
    N = [100, 100]  # Number of samples per group
    G = len(N)  # Number of groups
    data_dt = pd.read_csv(
        "http://ftp.ebi.ac.uk/pub/databases/mofa/getting_started/data.txt.gz",
        sep="\t")
    ent = entry_point()
    ent.set_data_options(scale_groups=False, scale_views=False)
    ent.set_data_df(data_dt, likelihoods=["gaussian", "gaussian"])
    ent.set_model_options(factors=10,
                          spikeslab_weights=True,
                          spikeslab_factors=True,
                          ard_factors=True,
                          ard_weights=True)
    ent.set_train_options(iter=1000,
                          convergence_mode="fast",
                          startELBO=1,
                          freqELBO=1,
                          dropR2=0.001,
                          gpu_mode=True,
                          verbose=False,
                          seed=1)
    ent.build()

    ent.run()
    pdb.set_trace()
def run_mofa_plus(Y):
    K = 10
    G = 1
    M = 1
    D = [Y.shape[1]]
    N = [Y.shape[0]]
    data_mat = [[None for g in range(G)] for m in range(M)]
    data_mat[0][0] = np.copy(Y)
    ent = entry_point()
    ent.set_data_options(scale_groups=False,
                         use_float32=True,
                         scale_views=False)
    ent.set_data_matrix(data_mat, likelihoods=["gaussian"])

    ent.set_model_options(factors=K * 2,
                          spikeslab_weights=True,
                          ard_factors=True,
                          ard_weights=True)

    ent.set_train_options(iter=300,
                          convergence_mode="slow",
                          startELBO=1,
                          freqELBO=1,
                          dropR2=0.001,
                          nostop=True,
                          gpu_mode=True,
                          startSparsity=1,
                          verbose=False,
                          seed=1)

    ent.build()
    ent.run()
    pdb.set_trace()
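    # Expectation keys below are assumed from the mofapy2 node interface:
    # 'EN' is taken to be the posterior mean of the Gaussian component and
    # 'EB' the posterior inclusion probability of the spike-and-slab indicator.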
    learned_U = (ent.model.nodes['Z'].getExpectations())['EN']
    learned_S_U = (ent.model.nodes['Z'].getExpectations())['EB']
    learned_V = (ent.model.nodes['W'].getExpectations())[0]['EN']
    learned_S_V = (ent.model.nodes['W'].getExpectations())[0]['EB']
    theta_w = ent.model.nodes['ThetaW'].getExpectations()[0]['E']
    theta_z = ent.model.nodes['ThetaZ'].getExpectations()['E']
    fit = {
        'V': learned_V,
        'V_S': learned_S_V,
        'V_E': learned_V * learned_S_V,
        'U': learned_U,
        'U_S': learned_S_U,
        'U_E': learned_U * learned_S_U,
        'theta_V': theta_w,
        'theta_U': theta_z
    }
    return fit
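

# Hypothetical usage sketch (not part of the original snippet): Y is assumed to
# be a (samples x features) numpy array; the shapes below are illustrative only.
if __name__ == "__main__":
    Y_demo = np.random.normal(size=(200, 500))
    fit_demo = run_mofa_plus(Y_demo)
    # U is (samples x factors), V is (features x factors)
    print(fit_demo["U_E"].shape, fit_demo["V_E"].shape)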
Example #3
def train_MOFA(input_data,
               times,
               group_names,
               feature_names,
               sample_names,
               view_names,
               outfile,
               use_GP=True,
               model_groups=True,
               center_groups=False):

    from mofapy2.run.entry_point import entry_point

    # prepare MEFISTO model
    ent = entry_point()
    ent.set_data_options(center_groups=center_groups)
    ent.set_data_matrix(input_data,
                        groups_names=group_names,
                        features_names=feature_names,
                        samples_names=sample_names,
                        views_names=view_names)

    ent.set_model_options(factors=2)
    ent.set_train_options(seed=2020)

    if use_GP:
        ent.set_covariates(times, covariates_names="month")
        ent.set_smooth_options(model_groups=model_groups,
                               warping=False,
                               warping_ref=0,
                               n_grid=10,
                               opt_freq=50,
                               start_opt=50)  # opt_freq added for RCLR

    # Build and run the model
    ent.build()
    ent.run()

    # interpolate
    if use_GP:
        ent.predict_factor(new_covariates=ent.model.nodes["Sigma"].covariates)

    ent.save(outfile)
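

# Hypothetical usage sketch (not part of the original source): one view, two
# groups, random data; all names, shapes and the output path are illustrative.
if __name__ == "__main__":
    import numpy as np

    rna = [np.random.normal(size=(30, 100)), np.random.normal(size=(25, 100))]
    train_MOFA(input_data=[rna],
               times=[np.arange(30).reshape(-1, 1), np.arange(25).reshape(-1, 1)],
               group_names=["groupA", "groupB"],
               feature_names=[[f"gene{i}" for i in range(100)]],
               sample_names=[[f"sA{i}" for i in range(30)],
                             [f"sB{i}" for i in range(25)]],
               view_names=["rna"],
               outfile="mefisto_demo.hdf5",
               use_GP=True)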
Example #4
    def test_build_basic(self):
        ent = entry_point()
        ent.set_data_options(scale_groups=False, scale_views=False)
        views_names = ["view1", "view2"]
        groups_names = ["groupA", "groupB"]

        # Set dimensions
        n_g1, n_g2 = 10, 20
        d_m1, d_m2 = 30, 40
        np.random.seed(42)
        ent.set_data_matrix([
            [np.random.random((n_g1, d_m1)),
             np.random.random((n_g2, d_m1))],
            [np.random.random((n_g1, d_m2)),
             np.random.random((n_g2, d_m2))],
        ])

        ent.set_model_options()
        ent.set_train_options()
        ent.build()
Example #5
# datafile = "/hps/nobackup2/research/stegle/users/ricard/peer/data/FullFreeze_Corrected_iPSC_TPM2_20180626_hqS.txt.gz"
datafile = "/g/stegle/ricard/peer/data/FullFreeze_Corrected_iPSC_TPM2_20180626_hqS.txt.gz"

# The data has to be loaded as a pandas dataframe or as a numpy matrix with dimensions (samples,features)
data = pd.read_csv(datafile, header=0, sep='\t', index_col=0)

# Define likelihoods: non-gaussian likelihoods are implemented (poisson and bernoulli), but by default we use gaussian.
lik = ["gaussian"]

###########################
## Initialise MOFA model ##
###########################

# initialise the entry point
ent = entry_point()

# Set data options
ent.set_data_options(likelihoods=lik)

# Set data
ent.set_data_matrix([[data]])  # do not modify this nested list

# Set model options
# - factors: number of factors
# - spikeslab_weights: use spike-and-slab sparsity on the loadings?
# - ard_weights: use ARD prior on the loadings (please do not edit this)
ent.set_model_options(factors=100,
                      spikeslab_weights=False,
                      ard_weights=False,
                      likelihoods=lik)
Example #6
import os
import logging
from datetime import datetime
from functools import reduce
from time import strftime
from typing import Any, List, Mapping, Optional, Union
from warnings import warn

import h5py
import numpy as np
import pandas as pd
from anndata import AnnData
from mudata import MuData

# _set_mofa_data_from_mudata is a private muon helper used below; it is not
# shown in this excerpt.


def mofa(
    data: Union[AnnData, MuData],
    groups_label: Optional[str] = None,
    use_raw: bool = False,
    use_layer: Optional[str] = None,
    use_var: Optional[str] = "highly_variable",
    use_obs: Optional[str] = None,
    likelihoods: Optional[Union[str, List[str]]] = None,
    n_factors: int = 10,
    scale_views: bool = False,
    scale_groups: bool = False,
    center_groups: bool = True,
    ard_weights: bool = True,
    ard_factors: bool = True,
    spikeslab_weights: bool = True,
    spikeslab_factors: bool = False,
    n_iterations: int = 1000,
    convergence_mode: str = "fast",
    use_float32: bool = False,
    gpu_mode: bool = False,
    svi_mode: bool = False,
    svi_batch_size: float = 0.5,
    svi_learning_rate: float = 1.0,
    svi_forgetting_rate: float = 0.5,
    svi_start_stochastic: int = 1,
    smooth_covariate: Optional[str] = None,
    smooth_warping: bool = False,
    smooth_kwargs: Optional[Mapping[str, Any]] = None,
    save_parameters: bool = False,
    save_data: bool = True,
    save_metadata: bool = True,
    seed: int = 1,
    outfile: Optional[str] = None,
    expectations: Optional[List[str]] = None,
    save_interrupted: bool = True,
    verbose: bool = False,
    quiet: bool = True,
    copy: bool = False,
):
    """
    Run Multi-Omics Factor Analysis

    PARAMETERS
    ----------
    data
            an AnnData or MuData object
    groups_label : optional
            a column name in adata.obs for grouping the samples
    use_raw : optional
            use raw slot of AnnData as input values
    use_layer : optional
            use a specific layer of AnnData as input values (supersedes use_raw option)
    use_var : optional
            .var column with a boolean value to select genes (e.g. "highly_variable"), "highly_variable" by default
    use_obs : optional
            strategy to deal with samples (cells) not being the same across modalities ("union" or "intersection", throw error by default)
    likelihoods : optional
            likelihoods to use, default is guessed from the data
    n_factors : optional
            number of factors to train the model with
    scale_views : optional
            scale views to unit variance
    scale_groups : optional
            scale groups to unit variance
    center_groups : optional
            center groups to zero mean (True by default)
    ard_weights : optional
            use view-wise sparsity
    ard_factors : optional
            use group-wise sparsity
    spikeslab_weights : optional
            use feature-wise sparsity (e.g. gene-wise)
    spikeslab_factors : optional
            use sample-wise sparsity (e.g. cell-wise)
    n_iterations : optional
            upper limit on the number of iterations
    convergence_mode : optional
            fast, medium, or slow convergence mode
    use_float32 : optional
            use reduced precision (float32)
    gpu_mode : optional
            whether to use GPU mode
    svi_mode : optional
            whether to use Stochastic Variational Inference (SVI)
    svi_batch_size : optional
            batch size as a fraction (only applicable when svi_mode=True, 0.5 by default)
    svi_learning_rate : optional
            learning rate (only applicable when svi_mode=True, 1.0 by default)
    svi_forgetting_rate : optional
            forgetting_rate (only applicable when svi_mode=True, 0.5 by default)
    svi_start_stochastic : optional
            first iteration to start SVI (only applicable when svi_mode=True, 1 by default)
    smooth_covariate : optional
            use a covariate (column in .obs) to learn smooth factors (MEFISTO)
    smooth_warping : optional
            whether to learn the alignment of covariates (e.g. time points) from different groups;
            by default, the first group is used as a reference, which can be adjusted by setting
            the REF_GROUP in smooth_kwargs = { "warping_ref": REF_GROUP } (MEFISTO)
    smooth_kwargs : optional
            additional arguments for MEFISTO (covariates_names, scale_cov, start_opt, n_grid, opt_freq,
            warping_freq, warping_ref, warping_open_begin, warping_open_end,
            sparseGP, frac_inducing, model_groups, new_values)
    save_parameters : optional
            whether to save training parameters
    save_data : optional
            whether to save training data
    save_metadata : optional
            whether to load metadata from the AnnData object (.obs and .var tables) and save it, True by default
    seed : optional
            random seed
    outfile : optional
            path to HDF5 file to store the model
    expectations : optional
            which nodes to save expectations for (only W and Z are saved by default);
            possible expectation names include Y, W, Z, Tau, AlphaZ, AlphaW, ThetaW, ThetaZ
    save_interrupted : optional
            whether to save a partially trained model when training is interrupted
    verbose : optional
            print verbose information during training
    quiet : optional
            silence messages during training procedure
    copy : optional
            return a copy of AnnData instead of writing to the provided object
    """

    try:
        from mofapy2.run.entry_point import entry_point
    except ImportError:
        raise ImportError(
            "MOFA+ is not available. Install MOFA+ from PyPI (`pip install mofapy2`) or from GitHub (`pip install git+https://github.com/bioFAM/MOFA2`)"
        )

    if isinstance(data, AnnData):
        logging.info("Wrapping an AnnData object into an MuData container")
        mdata = MuData(data)
        # Modality name is used as a prefix by default
        mdata.obs = data.obs
    elif isinstance(data, MuData):
        mdata = data
    else:
        raise TypeError("Expected an AnnData or MuData object")

    if outfile is None:
        outfile = os.path.join("/tmp", "mofa_{}.hdf5".format(strftime("%Y%m%d-%H%M%S")))

    if use_var:
        if use_var not in data.var.columns:
            warn(f"There is no column {use_var} in the provided object")
            use_var = None

    if isinstance(data, MuData):
        common_obs = reduce(np.intersect1d, [v.obs_names.values for k, v in mdata.mod.items()])
        if len(common_obs) != mdata.n_obs:
            if not use_obs:
                raise IndexError(
                    "Not all the observations are the same across modalities. Please run `mdata.intersect_obs()` to subset the data or devise a strategy with `use_obs` ('union' or 'intersection')"
                )
            elif use_obs not in ["union", "intersection"]:
                raise ValueError(
                    f"Expected `use_obs` argument to be 'union' or 'intersection', not '{use_obs}'"
                )
        else:
            use_obs = None

    ent = entry_point()

    lik = likelihoods
    if lik is not None:
        if isinstance(lik, str):
            lik = [lik for _ in range(len(mdata.mod))]

    ent.set_data_options(
        scale_views=scale_views,
        scale_groups=scale_groups,
        center_groups=center_groups,
        use_float32=use_float32,
    )
    logging.info(
        f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting data from MuData object..."
    )
    _set_mofa_data_from_mudata(
        model=ent,
        mdata=mdata,
        groups_label=groups_label,
        use_raw=use_raw,
        use_layer=use_layer,
        likelihoods=lik,
        features_subset=use_var,
        save_metadata=save_metadata,
        use_obs=use_obs,
    )
    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting model options...")
    ent.set_model_options(
        ard_factors=ard_factors,
        ard_weights=ard_weights,
        spikeslab_weights=spikeslab_weights,
        spikeslab_factors=spikeslab_factors,
        factors=n_factors,
    )
    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting training options...")
    ent.set_train_options(
        iter=n_iterations,
        convergence_mode=convergence_mode,
        gpu_mode=gpu_mode,
        seed=seed,
        verbose=verbose,
        quiet=quiet,
        outfile=outfile,
        save_interrupted=save_interrupted,
    )

    if svi_mode:
        logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting up SVI...")
        ent.set_stochastic_options(
            learning_rate=svi_learning_rate,
            forgetting_rate=svi_forgetting_rate,
            batch_size=svi_batch_size,
            start_stochastic=svi_start_stochastic,
        )

    # MEFISTO options

    smooth_kwargs_default = dict(
        covariates_names=smooth_covariate,
        scale_cov=False,
        start_opt=20,
        n_grid=20,
        opt_freq=10,
        model_groups=True,
        warping_freq=20,
        warping_ref=0,
        warping_open_begin=True,
        warping_open_end=True,
        sparseGP=False,
        frac_inducing=None,
        new_values=None,
    )

    if not smooth_kwargs:
        smooth_kwargs = {}

    # warping_ref has to be an integer
    if "warping_ref" in smooth_kwargs:
        warping_ref = smooth_kwargs["warping_ref"]
        if not isinstance(warping_ref, int):
            warping_ref = np.where(np.array(ent.data_opts["groups_names"]) == warping_ref)[0]
            if len(warping_ref) == 0:
                raise KeyError(
                    f"Expected 'warping_ref' for be a group name but there is no group {warping_ref}"
                )
            smooth_kwargs["warping_ref"] = warping_ref[0]

    # Add default options where they are not provided
    smooth_kwargs = {**smooth_kwargs_default, **smooth_kwargs}

    if smooth_covariate is not None:
        logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Adding smooth options...")
        ent.set_covariates(smooth_covariate, covariates_names=smooth_kwargs["covariates_names"])
        ent.set_smooth_options(
            scale_cov=smooth_kwargs["scale_cov"],
            start_opt=smooth_kwargs["start_opt"],
            n_grid=smooth_kwargs["n_grid"],
            opt_freq=smooth_kwargs["opt_freq"],
            model_groups=smooth_kwargs["model_groups"],
            warping=smooth_warping,
            warping_freq=smooth_kwargs["warping_freq"],
            warping_ref=smooth_kwargs["warping_ref"],
            warping_open_begin=smooth_kwargs["warping_open_begin"],
            warping_open_end=smooth_kwargs["warping_open_end"],
            sparseGP=smooth_kwargs["sparseGP"],
            frac_inducing=smooth_kwargs["frac_inducing"],
        )

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Building the model...")
    ent.build()
    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Running the model...")
    ent.run()

    if (
        smooth_kwargs is not None
        and "new_values" in smooth_kwargs
        and smooth_kwargs["new_values"]
        and smooth_covariate
    ):
        logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Interpolating factors...")
        new_values = np.array(smooth_kwargs["new_values"])
        if new_values.ndim == 1:
            new_values = new_values.reshape(-1, 1)
        ent.predict_factor(new_covariates=new_values)

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Saving the model...")
    ent.save(
        outfile, save_data=save_data, save_parameters=save_parameters, expectations=expectations
    )

    f = h5py.File(outfile, "r")
    if copy:
        data = data.copy()

    # Factors
    z = np.concatenate([v[:, :] for k, v in f["expectations"]["Z"].items()], axis=1).T

    # Samples are grouped per sample group
    # so the rows of the Z matrix have to be re-ordered
    if groups_label:
        zs = np.concatenate([v[:] for k, v in f["samples"].items()], axis=0).astype(str)

    if use_obs and use_obs == "intersection":  # data is MuData and common_obs is available
        if groups_label:
            z = pd.DataFrame(z, index=zs).loc[common_obs].to_numpy()
        # Set factor values outside of the obs intersection to nan
        data.obsm["X_mofa"] = np.empty(shape=(data.n_obs, z.shape[1]))
        data.obsm["X_mofa"][:] = np.nan
        # Samples
        data.obsm["X_mofa"][data.obs.index.isin(common_obs)] = z
    else:
        if groups_label:
            z = pd.DataFrame(z, index=zs).loc[mdata.obs.index.values].to_numpy()
        data.obsm["X_mofa"] = z

    # Weights
    w = np.concatenate([v[:, :] for k, v in f["expectations"]["W"].items()], axis=1).T
    if use_var:
        # Set the weights of features that were not used to zero
        data.varm["LFs"] = np.zeros(shape=(data.n_vars, w.shape[1]))
        data.varm["LFs"][data.var[use_var]] = w
    else:
        data.varm["LFs"] = w

    # Aligned times
    if smooth_covariate is not None and smooth_warping:
        for c in range(ent.dimensionalities["C"]):
            cnm = ent.smooth_opts["covariates_names"][c] + "_warped"
            cval = ent.model.getNodes()["Sigma"].sample_cov_transformed[:, c]
            if groups_label:
                cval = pd.DataFrame(cval, index=zs).loc[common_obs].to_numpy()
            data.obs[cnm] = cval

    # Parameters
    data.uns["mofa"] = {
        "params": {
            "data": {
                "groups_label": groups_label,
                "use_raw": use_raw,
                "use_layer": use_layer,
                "likelihoods": f["model_options"]["likelihoods"][:].astype(str),
                "features_subset": use_var,
                "use_obs": use_obs,
                "scale_views": scale_views,
                "scale_groups": scale_groups,
                "center_groups": center_groups,
                "use_float32": use_float32,
            },
            "model": {
                "ard_factors": ard_factors,
                "ard_weights": ard_weights,
                "spikeslab_weights": spikeslab_weights,
                "spikeslab_factors": spikeslab_factors,
                "n_factors": n_factors,
            },
            "training": {
                "n_iterations": n_iterations,
                "convergence_mode": convergence_mode,
                "gpu_mode": gpu_mode,
                "seed": seed,
            },
        }
    }

    # Variance explained
    try:
        views = f["views"]["views"][:].astype(str)
        variance_per_group = f["variance_explained"]["r2_per_factor"]
        variance = {m: {} for m in views}

        groups = f["groups"]["groups"][:].astype(str)
        if len(groups) > 1:
            for group in list(variance_per_group.keys()):
                for i, view in enumerate(views):
                    variance[view][group] = variance_per_group[group][i, :]
        else:
            for i, view in enumerate(views):
                variance[view] = variance_per_group[groups[0]][i, :]
        data.uns["mofa"]["variance"] = variance
    except Exception:
        warn("Cannot save variance estimates")

    f.close()

    if copy:
        return data
    else:
        print("Saved MOFA embeddings in .obsm['X_mofa'] slot and their loadings in .varm['LFs'].")

    return None
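

# Hypothetical usage sketch (not part of the original source): in muon this
# function is exposed as muon.tl.mofa; the input file, modality names and
# option values below are illustrative only.
#
#   import muon as mu
#   mdata = mu.read("multiome.h5mu")            # hypothetical MuData file
#   mu.tl.mofa(mdata, n_factors=15, convergence_mode="fast",
#              outfile="mofa_model.hdf5")
#   mdata.obsm["X_mofa"]   # per-cell factor values
#   mdata.varm["LFs"]      # per-feature loadings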
Example #7
def run_grid(nfactors = 3, G = 5, N = 20, Dm = 500, noise_level = 1, missing = 0.1, missing_all = 0.1, seed = 1234567,
              method = "MEFISTO", note = "none", lscales = [0.2, 0.1, 0.0], scales = [1, 0.6, 0], M = 4, plot = False,
             max_iter = 1000, verbose = False, sparse_frac = 0.75, warp = False, save = False, group_differences = True,
             model_groups = True):

    nfactors = int(nfactors)
    assert len(lscales) == nfactors
    assert len(scales) == nfactors

    groupsidx = np.repeat(range(G), N)

    # simulate data
    np.random.seed(seed)
    if group_differences:
        if nfactors == 3:
            sharedness = np.random.choice([True, False], 2, replace=False).tolist() + [False] # one shared, one non-shared, one non-smooth
        else:
            sharedness = True # not in use
    else:
        sharedness = True
    sim = simmofa.simulate_data(N=N, seed=seed, views=["0", "1", "2", "3"], D=[Dm] * M,
                                K=nfactors, G=G, lscales=lscales, noise_level=noise_level,
                                scales = scales, shared = sharedness)

    # mask parts of the data
    data_full = copy.deepcopy(sim['data'])
    sim['data'] = simmofa.mask_samples(sim, perc = missing, perc_all_views = missing_all)

    # misalign covariates between groups
    if warp:
        assert G == 3, "Warping defined only for G=3"
        sim['sample_cov'][1] = np.exp(sim['sample_cov'][1])
        sim['sample_cov'][2] = 0.4 * sim['sample_cov'][2] + 0.3
    
    # optional plotting of simulated factors
    if plot:
        fig, axs = plt.subplots(1, nfactors)
        Zsim = sim['Z']
        for g in range(G):
            for i in range(nfactors):
                axs[i].scatter(sim['sample_cov'][g], Zsim[g][:, i])
                axs[i].set_title("simulated factors")

    # prepare model
    ent = entry_point()
    ent.set_data_options(scale_views=False)
    ent.set_data_matrix(sim['data'])
    ent.set_model_options(factors=nfactors)
    ent.set_train_options(seed=2020, convergence_mode="fast", iter=max_iter, verbose=verbose)

    # for time-aware multi-modal FA with GP model add covariates
    if not method == "MOFA2":
        ent.set_covariates(sim['sample_cov'])
        if method == "MEFISTO+align":
            ent.set_smooth_options(warping=True, model_groups = model_groups)
        elif method == "MEFISTO_sparse":
            ent.set_smooth_options(model_groups = model_groups, sparseGP = True, n_inducing= int((N * G) * sparse_frac))
        else:
            ent.set_smooth_options(model_groups = model_groups)


    # run and build the model
    tracemalloc.start()
    ent.build()
    t0 = time.time()
    ent.run()
    t1 = time.time()
    total = t1 - t0
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # get inferred hyperparameters
    if method != "MOFA2":
        scales_learnt = ent.model.train_stats['scales']
        lscales_learnt = ent.model.train_stats['length_scales']
    else:
        scales_learnt = np.array([np.nan] * nfactors)
        lscales_learnt = np.array([np.nan] * nfactors)

    # get factors recovery error
    Zlearnt = ent.model.getExpectations()['Z']['E']

    # calculate factor recovery error
    # if the wrong number of factors was inferred, set the metrics to nan
    if not Zlearnt.shape[1] == nfactors: 
        factor_r2 = np.nan
        scales_learnt = [np.nan] * nfactors
        lscales_learnt = [np.nan] * nfactors
        post_var = [np.nan] * nfactors
        factor_idx = [np.nan] * nfactors
    else:
        Zsim = np.vstack(sim['Z'])
         # get idx of learnt factor corresponding to simulated factor by maximal correlation
        factor_idx = [np.argmax([abs(ss.pearsonr(Zsim[:,p], Zlearnt[:,pp])[0]) for pp in range(nfactors)]) for p in range(nfactors)]
        # check for duplicates - if present, not every simulated factor is captured by a unique learnt factor
        if not len(factor_idx) == len(set(factor_idx)): 
            factor_r2 = np.nan
        else:
            # calculate correlation between inferred and simulated factors
            factor_r2 = np.mean([np.max([abs(ss.pearsonr(Zsim[:, pp], Zlearnt[:, p])[0])
                                         for pp in range(nfactors)])
                                 for p in range(nfactors)]) ** 2
            if method != "MOFA2":
                scales_learnt = scales_learnt[factor_idx] # match to simulated factor
                lscales_learnt = lscales_learnt[factor_idx]
                if verbose:
                    print(scales_learnt)
        
        # get posterior variance
        post_var = ent.model.getExpectations()['Z']['E2'] - (ent.model.getExpectations()['Z']['E']) ** 2
        post_var = post_var.mean(axis = 0)

    # get imputation error
    ent.impute(mask_outliers = False)
    mse = 0
    n_missing = 0
    imp_r2 = 1
    if missing + missing_all > 0:
        for m in range(M):
            mse_m = ((ent.imputed_data["mean"][m][ent.model.nodes['Y'].getNodes()[m].getMask()] - np.vstack(data_full[m])[ent.model.nodes['Y'].getNodes()[m].getMask()])**2).sum()
            mse = mse + mse_m
            n_missing = n_missing + ent.model.nodes['Y'].getNodes()[m].getMask().sum()
        mse = mse / n_missing

        imp_r2 = np.mean([ss.pearsonr(np.vstack(data_full[m])[ent.model.nodes['Y'].getNodes()[m].getMask()].flatten(), ent.imputed_data["mean"][m][ent.model.nodes['Y'].getNodes()[m].getMask()].flatten())[0] ** 2 for m in range(M)]) 
    rec_r2 = np.mean([ss.pearsonr(np.vstack(data_full[m]).flatten(), ent.imputed_data["mean"][m].flatten())[0]  ** 2 for m in range(M)] )

    # get warping error
    if method == "MEFISTO+align":
        sample_cov_transformed = ent.model.getNodes()['Sigma'].sample_cov_transformed
        # compare to untransformed group
        warp_mse = sum([sum((sample_cov_transformed[groupsidx == g] - sim['sample_cov'][0])**2) for g in range(G)]) / (N * G)

    else: # no transformation made
        warp_mse = sum([sum((sim['sample_cov'][g] - sim['sample_cov'][0])**2) for g in range(G)]) / (N * G)
   
    warp_mse = warp_mse[0]

    # get group covariance error:
    if group_differences and len(factor_idx) == len(set(factor_idx)):
        if "Sigma" in ent.model.nodes.keys() and model_groups:
            Gmat_learnt = ent.model.nodes['Sigma'].getParameters()['Kg']
        # MEFISTO without model_groups assumes all groups to be connected
        elif "Sigma" in ent.model.nodes.keys():
            Gmat_learnt = [np.ones([G,G])] * nfactors
        # MOFA2 assumes all groups to be unconnected
        else:
            Gmat_learnt = [np.eye(G)] * nfactors

        # get sharedness error
        true_sharedness = [np.mean(np.abs(sim['Gmats'][k] - np.eye(G))[np.triu_indices(G, 1)]) for k in range(nfactors)]
        inferred_sharedness = [np.mean(np.abs(Gmat_learnt[factor_idx[k]] - np.eye(G))[np.triu_indices(G, 1)]) for k in
                               range(nfactors)]

    # if no group covariance was simulated set to nan     
    else:
        true_sharedness = [np.nan] * nfactors
        inferred_sharedness = [np.nan] * nfactors

    # write output to csv
    results = {'factor_r2' : factor_r2, 'time':  total, 'method' : method, 'model_groups' : model_groups,
                'group_differences': group_differences, 'N' : N, 'G': G,
               'Dm' : Dm, 'noise_level': noise_level, 'missing' : missing + missing_all, 'seed' : seed,
               'date' : date.today(), 'note' : note, 'mem_usage': peak, 'lscales' : lscales,
               'scales' : scales, 'sparse_frac' : sparse_frac,
               'n_factors' : nfactors,
               'warp_mse' : warp_mse,
               'n_factors_learnt' : Zlearnt.shape[1],
               'scales_learnt' : scales_learnt,
               'lscales_learnt' : lscales_learnt,
               'true_sharedness' : np.array(true_sharedness),
               'inferred_sharedness' : np.array(inferred_sharedness),
               'post_var' : post_var, 'mse' : mse,  'imp_r2' : imp_r2, 'rec_r2' : rec_r2}
    if verbose:
        print(results)

    df = pd.DataFrame.from_dict(data=results, orient='index').T
    # expand multi-factor columns
    for nm in ['scales', 'lscales', 'scales_learnt', 'lscales_learnt', 'true_sharedness', 'inferred_sharedness', 'post_var']:
        dfsplit = df[nm].apply(pd.Series)
        dfsplit = dfsplit.rename(columns=lambda x: nm + "_" + str(x))
        df = pd.concat([df, dfsplit], axis=1)
        df = df.drop(columns = [nm])

    # optional plotting of inferred factors
    if plot:
        Zlearnt = ent.model.getExpectations()['Z']['E']
        fig, axs = plt.subplots(1, nfactors)
        for g in range(G):
            for i in range(nfactors):
                axs[i].scatter(sim['sample_cov'][g], Zlearnt[groupsidx == g, i])
                axs[i].set_title("inferred factors")

    # save summary statistics if not the model itself is saved
    if not save:
        if os.path.exists('out/simulation_results.csv'):
            df.to_csv('out/simulation_results.csv', mode='a', header=False)
        else:
            df.to_csv('out/simulation_results.csv', header=True)

    else:
        ent.save("out/grid_model.hdf5")
Example #8
args = parser.parse_args()

if type(args.logfile) == str:
    logging.basicConfig(level=logging.INFO, filename=args.logfile)
else:
    logging.basicConfig(level=logging.INFO)

logging.info("Setting up numpy multi-threading. Using {} threads".format(
    args.use_threads))
os.environ['OPENBLAS_NUM_THREADS'] = str(args.use_threads)

logging.info("Reading in ADT counts matrices")

# initialise the model entry point
mofa_ent = entry_point()

# each matrix should be gzipped
file_list = args.counts_files.split(",")

# read in the donor data frames as dict for reference
# must be the same number of files unless running joint mode
logging.info("Readining in donor information: {}".format(args.donor_files))
donor_list = args.donor_files.split(",")

if len(donor_list) != len(file_list):
    if args.omic == 'joint' and len(donor_list) == 1:
        donor_names = ["Gene", "ADT"]
        donor_df_list = [
            pd.read_table(D, sep="\t", header=0, index_col='CellID')
            for D in donor_list
Example #9
    def __init__(
        self,
        views=None,
        groupby=None,
        likelihoods=None,
        factors_n=10,
        covariates=None,
        fit_intercept=True,
        scale_views=True,
        scale_groups=True,
        iterations=1000,
        convergence_mode="slow",
        use_overlap=True,
        startELBO=1,
        freqELBO=1,
        dropR2=None,
        verbose=1,
        from_file=None,
    ):
        """
        This is a wrapper around MOFA to perform multi-omics integration of the GDSC datasets.

            - Multiple groups are NOT supported
            - Only samples present in all views are considered
        
        :param views: dict(str: pandas.DataFrame)
        """

        self.verbose = verbose
        self.from_file = from_file

        self.factors_n = factors_n
        self.factors_labels = [f"F{i + 1}" for i in range(factors_n)]

        self.likelihoods = likelihoods

        self.views = views
        self.scale_views = scale_views
        self.scale_groups = scale_groups
        self.views_labels = list(self.views)

        self.use_overlap = use_overlap

        self.iterations = iterations
        self.convergence_mode = convergence_mode
        self.startELBO = startELBO
        self.freqELBO = freqELBO
        self.dropR2 = dropR2

        # Covariates
        self.covariates = covariates
        self.fit_intercept = fit_intercept

        # Samples
        self.samples = set.intersection(
            *[set(self.views[v]) for v in self.views_labels])

        if self.covariates is not None:
            LOG.info(f"Covariates provided N={len(self.covariates)}")
            for k, v in self.covariates.items():
                self.samples = self.samples.intersection(set(v.index))

        self.samples = list(self.samples)
        LOG.info(f"Overlaping samples: {len(self.samples)}")

        # Reduce to overlaping samples
        if self.use_overlap:
            for k, df in self.views.items():
                self.views[k] = df[self.samples]

        # Info
        for k, df in self.views.items():
            LOG.info(f"View {k}: {df.shape}")

        # Regress-out covariates
        if self.covariates is not None:
            self.views = self.regress_out_covariates(
                fit_intercept=self.fit_intercept)

        # RUN MOFA
        # Prepare data, melt & tidy
        self.data = []

        for k in self.views_labels:
            df = self.views[k].copy()
            df.index.name = "feature"
            df.columns.name = "sample"

            df = df.unstack().rename("value").reset_index()

            if groupby is not None:
                df["group"] = groupby.reindex(df["sample"]).values

            else:
                df = df.assign(group="gdsc")

            self.data.append(df.assign(view=k))

        self.data = pd.concat(self.data, ignore_index=True)
        self.data = self.data[["sample", "group", "feature", "value",
                               "view"]].dropna()

        # Initialise entry point
        self.ep = entry_point()

        # Set data options
        self.ep.set_data_options(scale_groups=self.scale_groups,
                                 scale_views=self.scale_views)
        self.ep.set_data_df(self.data, likelihoods=self.likelihoods)

        # Set model options
        self.ep.set_model_options(factors=self.factors_n)

        # Set training options
        self.ep.set_train_options(
            iter=self.iterations,
            convergence_mode=self.convergence_mode,
            startELBO=self.startELBO,
            freqELBO=self.freqELBO,
            dropR2=self.dropR2,
            verbose=verbose > 0,
        )

        # Run MOFA
        self.ep.build()

        if not os.path.isfile(self.from_file):
            self.ep.run()
            self.save_hdf5(self.from_file)

        self.mofa_file = h5py.File(self.from_file, "r")

        self.factors = self.get_factors(self.mofa_file)
        self.weights = self.get_weights(self.mofa_file)
        self.rsquare = self.get_rsquare(self.mofa_file)
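

# Hypothetical usage sketch (not part of the original source; the class name is
# not shown in this excerpt, so "MOFAWrapper" is a placeholder): views maps view
# labels to (features x samples) DataFrames.
#
#   views = {"gexp": gexp_df, "prot": prot_df}
#   m = MOFAWrapper(views=views, likelihoods=["gaussian", "gaussian"],
#                   factors_n=10, from_file="out/gdsc_mofa.hdf5")
#   m.factors; m.weights; m.rsquare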
Example #10
def run_evodevo(nfactors=5,
                Ndown=3,
                warp=False,
                save=True,
                warping_ref="Mouse",
                sample_seed=4891,
                seed=2020,
                species=["Mouse", "Rabbit", "Rat", "Human", "Opossum"],
                views=["Brain", "Cerebellum", "Heart", "Liver", "Testis"],
                model_groups=True,
                nm=None,
                tissue_as_sample=False):

    if tissue_as_sample:
        assert not warp, "Need to adapt warping reference if tissues are treated as groups"

    # specify data directory of normalized gene expression data
    if species == ["Mouse", "Rabbit", "Rat"] and not warp:
        nmtmp = "MRRab"
        datadir = "data/input_data/MRRab_matched/"
    elif warp:
        nmtmp = "warping"
        datadir = "data/input_data/all_unmatched/"
    else:
        print("Matched inputs are only provided for [Mouse, Rabbit, Rat]")
        sys.exit()

    # set filenames for output
    if nm is None:
        nm = nmtmp

    # load data and covariate
    data = []
    times = []
    samples_names = []
    if tissue_as_sample:
        group_names = []
        data_view = []
        for m in views:
            for g in species:
                df = pd.read_csv(datadir + "view_" + m + "_group_" + g +
                                 ".csv",
                                 header=0,
                                 index_col=0)
                data_view.append(np.asarray(df).transpose())
                times.append(
                    np.asarray(
                        pd.read_csv(datadir + "times_group_" + g + ".csv",
                                    header=0,
                                    index_col=0)).transpose())
                samples_names.append(df.columns)
                group_names.append(m + "-" + g)
        data = [data_view]
        features_names = [df.index]
    else:
        for m in views:
            data_view = []
            for g in species:
                data_view.append(
                    np.asarray(
                        pd.read_csv(datadir + "view_" + m + "_group_" + g +
                                    ".csv",
                                    header=0,
                                    index_col=0)).transpose())
                if m == "Brain":  # only needed once
                    times.append(
                        np.asarray(
                            pd.read_csv(datadir + "times_group_" + g + ".csv",
                                        header=0,
                                        index_col=0)).transpose())
            data.append(data_view)

    # convert warping ref to numeric
    warping_ref = np.where(
        [species[i] == warping_ref for i in range(len(species))])[0][0]

    # mask values at random
    if Ndown > 0:
        np.random.seed(sample_seed)
        if tissue_as_sample:
            for i in range(len(data[0])):
                Ng = data[0][i].shape[0]
                masked_samples = np.random.choice(Ng, Ndown, replace=False)
                data[0][i][masked_samples, :] = np.nan
        else:
            for m in range(len(views)):
                for g in range(len(species)):
                    Ng = data[m][g].shape[0]
                    masked_samples = np.random.choice(Ng, Ndown, replace=False)
                    data[m][g][masked_samples, :] = np.nan

    # check dimension and name views and groups
    if tissue_as_sample:
        assert len(data) == 1, "problem in loading data, wrong number of views"
        assert len(data[0]) == len(species) * len(
            views), "problem in loading data, wrong number of groups"
        view_names = ["mRNA"]
    else:
        assert len(data) == len(
            views), "problem in loading data, wrong number of views"
        assert len(data[0]) == len(
            species), "problem in loading data, wrong number of groups"
        view_names = views
        group_names = species

    # prepare MOFA model with time as covariate
    ent = entry_point()
    ent.set_data_options()
    ent.set_data_matrix(data, groups_names=group_names, views_names=view_names)
    ent.set_model_options(factors=nfactors)
    ent.set_train_options(seed=seed, convergence_mode="medium")
    ent.set_covariates(times, covariates_names="time")
    ent.set_smooth_options(warping=warp,
                           warping_ref=warping_ref,
                           model_groups=model_groups)

    # Build and run the model
    tracemalloc.start()
    ent.build()
    t0 = time.time()
    ent.run()
    t1 = time.time()
    total = t1 - t0
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # save model
    if save:
        if Ndown == 0:
            if model_groups:
                outfile = "out/evodevo_groups_%s-seed_%s.hdf5" % (nm, seed)
            else:
                outfile = "out/evodevo_%s-seed_%s.hdf5" % (nm, seed)

            # interpolate for missing time points
            ent.predict_factor(
                new_covariates=ent.model.nodes["Sigma"].covariates)

        else:
            if model_groups:
                outfile = "out/evodevo_groups_%s-N%s-sample_seed_%s.hdf5" % (
                    nm, Ndown, sample_seed)
            else:
                outfile = "out/evodevo_%s-N%s-sample_seed_%s.hdf5" % (
                    nm, Ndown, sample_seed)

        ent.save(outfile)

    # write output to csv
    results = {
        'time': total,
        'mem_usage': peak,
        'n_down': Ndown,
        'sample_seed': sample_seed,
        'seed': seed
    }
    df = pd.DataFrame.from_dict(data=results, orient='index').T
    if model_groups:
        stats_file = 'out/evodevo_groups_%s_stats.csv' % nm
    else:
        stats_file = 'out/evodevo_%s_stats.csv' % nm
    if os.path.exists(stats_file):
        df.to_csv(stats_file, mode='a', header=False)
    else:
        df.to_csv(stats_file, header=True)