Example #1
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(
        cmd_args.input, sparsify=True).normalize(
            target=100000)  # Example data seem to be normalized to 100,000
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset = np.log2(dataset.exprs.toarray() + 1)
    mat_file = os.path.join(cmd_args.output_path, "matrix.txt.gz")
    res_file = os.path.join(cmd_args.output_path, "output_datafile")
    np.savetxt(mat_file, dataset)
    start_time = time.time()
    Dhaka.Dhaka(mat_file,
                latent_dim=cmd_args.n_latent,
                N_starts=1,
                epochs=cmd_args.n_epochs,
                output_datafile=res_file,
                to_cluster=0,
                gene_selection=0,
                to_plot=0,
                relative_expression=0)
    cb.data.write_hybrid_path(time.time() - start_time,
                              "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(np.loadtxt(res_file + ".txt"),
                              "//".join([cmd_args.output, "latent"]))
    os.remove(mat_file)
    os.remove(res_file + ".txt")
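These benchmark scripts expose a main(cmd_args) entry point but omit the argument parsing. A minimal driver in the spirit of Example #13 might look like the following sketch; the flag names and defaults are assumptions inferred from the cmd_args attributes that main() reads, not taken from the original repository.

import argparse

def parse_args():
    # Hypothetical flags inferred from the cmd_args.* accesses above;
    # the real script may define more options and different defaults.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, required=True)
    parser.add_argument("-o", "--output", type=str, required=True)
    parser.add_argument("--output-path", dest="output_path", type=str, required=True)
    parser.add_argument("--clean", type=str, default=None)
    parser.add_argument("-g", "--genes", type=str, default=None)
    parser.add_argument("--n-latent", dest="n_latent", type=int, default=10)
    parser.add_argument("--n-epochs", dest="n_epochs", type=int, default=100)
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())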
Example #2
def main(cmd_args):
    cb.utils.logger.info("Reading data...")
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input)
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    model = cb.directi.DIRECTi.load(cmd_args.model)
    data_dict = {
        "exprs": dataset[:, model.genes].exprs,
        "library_size": np.array(dataset.exprs.sum(axis=1)).reshape((-1, 1))
    }
    start_time = time.time()
    if cmd_args.target == "zeros":
        data_dict[cmd_args.batch_effect] = np.zeros(
            (dataset.shape[0],
             np.unique(dataset.obs[cmd_args.batch_effect]).size))
    elif cmd_args.target == "first":
        data_dict[cmd_args.batch_effect] = cb.utils.encode_onehot(
            dataset.obs["dataset_name"].astype(object).fillna("IgNoRe"),
            sort=True,
            ignore="IgNoRe").toarray()
        data_dict[cmd_args.batch_effect][:, 0] = 1.0
        data_dict[cmd_args.batch_effect][:, 1:] = 0.0
    else:  # cmd_args.target == "ones":
        data_dict[cmd_args.batch_effect] = np.ones(
            (dataset.shape[0],
             np.unique(dataset.obs[cmd_args.batch_effect]).size))
    corrected = model._fetch(model.prob_module.softmax_mu,
                             cb.utils.DataDict(data_dict))
    cb.data.write_hybrid_path(time.time() - start_time,
                              f"{cmd_args.output}//time")
    cb.data.write_hybrid_path(corrected, f"{cmd_args.output}//exprs")
Example #3
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        genes = dataset.uns[cmd_args.genes]
    else:
        genes = None
    dataset = dataset.to_anndata()
    start_time = time.time()
    dataset, model = dca_modpp.api.dca(
        dataset,
        genes,
        mode="latent",
        normalize_per_cell=10000,
        scale=False,
        hidden_size=(cmd_args.n_hidden, ) * cmd_args.n_layers +
        (cmd_args.n_latent, ) + (cmd_args.n_hidden, ) * cmd_args.n_layers,
        epochs=cmd_args.n_epochs,
        early_stop=cmd_args.patience,
        random_state=cmd_args.seed,
        threads=cmd_args.threads,
        return_model=True,
        copy=True)
    cb.data.write_hybrid_path(time.time() - start_time,
                              "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(dataset.obsm["X_dca"],
                              "//".join([cmd_args.output, "latent"]))
    model.encoder.save(os.path.join(cmd_args.output_path, "model.h5"))
    np.savetxt(os.path.join(cmd_args.output_path, "genes.txt"), genes, "%s")
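The results above are written with Cell_BLAST's hybrid path convention: the part before "//" names an HDF5 file and the part after it names a key inside that file (the same convention read back by read_hybrid_path in Example #6). A small sketch of loading the outputs, with "result.h5" standing in for whatever cmd_args.output pointed to:

import Cell_BLAST as cb

# "result.h5" is a placeholder for the actual cmd_args.output value.
latent = cb.data.read_hybrid_path("result.h5//latent")
elapsed = cb.data.read_hybrid_path("result.h5//time")
print(latent.shape, elapsed)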
Example #4
def main(cmd_args):

    cb.message.info("Loading index...")
    os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory() \
        if cmd_args.device is None else cmd_args.device
    blast = cb.blast.BLAST.load(cmd_args.index)
    if cmd_args.subsample_ref is not None:
        cb.message.info("Subsampling reference...")
        subsample_idx = np.random.RandomState(cmd_args.seed).choice(
            blast.ref.shape[0], cmd_args.subsample_ref, replace=False)
        blast.ref = blast.ref[subsample_idx, :]
        blast.latent = blast.latent[
            subsample_idx] if blast.latent is not None else None
        blast.cluster = blast.cluster[
            subsample_idx] if blast.cluster is not None else None
        blast.posterior = blast.posterior[
            subsample_idx] if blast.posterior is not None else None
        blast.nearest_neighbors = None
        blast.empirical = None
        blast._force_components()

    cb.message.info("Reading query...")
    query = cb.data.ExprDataSet.read_dataset(cmd_args.query)
    if cmd_args.clean:
        query = utils.clean_dataset(query, cmd_args.clean)

    if cmd_args.align:
        cb.message.info("Aligning...")
        unipath = "/tmp/cb/" + cb.utils.rand_hex()
        cb.message.info("Using temporary path: " + unipath)
        blast = blast.align(query, path=unipath)

    cb.message.info("BLASTing...")
    start_time = time.time()
    hits = blast.query(query,
                       n_neighbors=cmd_args.n_neighbors).reconcile_models()

    time_per_cell = None
    prediction_dict = {}
    for cutoff in cmd_args.cutoff:
        prediction_dict[cutoff] = hits.filter(
            by=cmd_args.filter_by, cutoff=cutoff).annotate(
                cmd_args.annotation,
                min_hits=cmd_args.min_hits)[cmd_args.annotation]
        if time_per_cell is None:
            time_per_cell = (time.time() - start_time) * 1000 / len(
                prediction_dict[cutoff])
    print("Time per cell: %.3fms" % time_per_cell)

    cb.message.info("Saving result...")
    if os.path.exists(cmd_args.output):
        os.remove(cmd_args.output)
    for cutoff in prediction_dict:
        cb.data.write_hybrid_path(
            prediction_dict[cutoff],
            "%s//prediction/%s" % (cmd_args.output, str(cutoff)))
    cb.data.write_hybrid_path(time_per_cell, "//".join(
        (cmd_args.output, "time")))
Example #5
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    dataset = dataset.normalize()
    dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset.exprs = np.log1p(dataset.exprs)
    if cmd_args.batch_effect is not None:
        batches = np.unique(dataset.obs[cmd_args.batch_effect])
        for batch in batches:
            dataset[dataset.obs[cmd_args.batch_effect] ==
                    batch, :].write_table(os.path.join(cmd_args.output_path,
                                                       "input",
                                                       f"{batch}.csv"),
                                          index=False)
    else:
        dataset.write_table(os.path.join(cmd_args.output_path, "input",
                                         "data.csv"),
                            index=False)

    call_args = [
        "python",
        os.path.join(SAUCIE.__path__[0], "SAUCIE.py"), "--input_dir",
        os.path.join(cmd_args.output_path, "input"), "--output_dir",
        os.path.join(cmd_args.output_path, "output"), "--seed",
        str(cmd_args.seed), "--cluster"
    ]
    if cmd_args.batch_effect is not None:
        call_args.append("--batch_correct")
    start_time = time.time()
    print(f"Running command: {' '.join(call_args)}")
    subprocess.check_call(call_args)
    cb.data.write_hybrid_path(time.time() - start_time,
                              "//".join([cmd_args.output, "time"]))

    if cmd_args.batch_effect is not None:
        latent = np.empty((dataset.shape[0], 2))
        for batch in batches:
            idx = np.where(dataset.obs[cmd_args.batch_effect] == batch)[0]
            latent[idx, :] = pd.read_csv(
                os.path.join(cmd_args.output_path, "output", "clustered",
                             f"{batch}.csv")
            ).loc[:, ["Embedding_SAUCIE1", "Embedding_SAUCIE2"]].to_numpy()
    else:
        latent = pd.read_csv(
            os.path.join(cmd_args.output_path, "output", "clustered",
                         "data.csv")
        ).loc[:, ["Embedding_SAUCIE1", "Embedding_SAUCIE2"]].to_numpy()
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))
Example #6
def main():
    x = cb.data.read_hybrid_path("//".join(
        [snakemake.input.x, "visualization"]))
    ds = cb.data.ExprDataSet.read_dataset(snakemake.input.data)
    ds = utils.clean_dataset(ds, snakemake.config["label"])

    axis1 = "{vis}1".format(vis=snakemake.wildcards.vis)
    axis2 = "{vis}2".format(vis=snakemake.wildcards.vis)
    label = snakemake.wildcards.label.replace("_", " ").capitalize()

    df = pd.DataFrame({
        axis1:
        x[:, 0],
        axis2:
        x[:, 1],
        label:
        pd.Categorical(ds.obs[snakemake.wildcards.label].values,
                       categories=sorted(np.unique(
                           ds.obs[snakemake.wildcards.label].values).tolist(),
                                         key=lambda x: x.lower()))
    })
    if snakemake.params["shuffle"]:
        df = df.sample(frac=1)

    fig, ax = plt.subplots(figsize=(snakemake.params["width"],
                                    snakemake.params["height"]))
    ax = sns.scatterplot(x=axis1,
                         y=axis2,
                         hue=label,
                         data=df,
                         s=snakemake.params["psize"],
                         edgecolor=None,
                         rasterized=snakemake.params["rasterized"],
                         ax=ax)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")
    ax.legend(bbox_to_anchor=(1.05, 0.5),
              loc="center left",
              borderaxespad=0.0,
              frameon=False,
              prop=dict(size=snakemake.params["legend_size"]),
              markerscale=snakemake.params["marker_scale"],
              labelspacing=snakemake.params["label_spacing"],
              ncol=np.ceil(np.unique(df[label]).size / 50).astype(int))
    fig.savefig(snakemake.output[0], dpi=300, bbox_inches="tight")
Example #7
def main(cmd_args):
    print("Loading model...")
    with open(os.path.join(cmd_args.model, "label_encoder.pickle"), "rb") as f:
        label_encoder = pickle.load(f)
    genes = np.loadtxt(os.path.join(cmd_args.model, "genes.txt"), dtype=str)
    vae = torch.load(os.path.join(cmd_args.model, "model.pickle"))

    print("Loading query...")
    query = cb.data.ExprDataSet.read_dataset(cmd_args.query, sparsify=True)
    if cmd_args.clean is not None:
        query = utils.clean_dataset(query, cmd_args.clean)
    n_cells = query.shape[0]
    if cmd_args.normalize:
        query = query.normalize()
    query = query[:, genes]
    query.to_anndata().write_h5ad(
        os.path.join(cmd_args.output_path, "query.h5ad"))
    query = scvi.dataset.AnnDataset("query.h5ad",
                                    save_path=cmd_args.output_path + "/")

    print("Predicting...")
    start_time = time.time()
    trainer = scvi.inference.annotation.CustomSemiSupervisedTrainer(
        vae, query, np.array([]), use_cuda=True, metrics_to_monitor=["ll"])
    prob = get_scanvi_class_posterior(trainer)

    time_per_cell = None
    prediction_dict = collections.defaultdict(
        lambda: np.repeat("rejected", n_cells).astype(object))
    for cutoff in cmd_args.cutoff:
        mask = prob.max(axis=1) > cutoff
        prediction_dict[cutoff][mask] = label_encoder.inverse_transform(
            prob[mask].argmax(axis=1))
        if time_per_cell is None:
            time_per_cell = (time.time() - start_time) * 1000 / n_cells
    print("Time per cell: %.3fms" % time_per_cell)

    print("Saving result...")
    if os.path.exists(cmd_args.output):
        os.remove(cmd_args.output)
    for cutoff, prediction in prediction_dict.items():
        cb.data.write_hybrid_path(
            prediction, "%s//prediction/%s" % (cmd_args.output, str(cutoff)))
    cb.data.write_hybrid_path(time_per_cell, "//".join(
        (cmd_args.output, "time")))
Example #8
def main(cmd_args):

    cb.message.info("Reading data...")
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.ref)
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)

    os.environ["CUDA_VISIBLE_DEVICES"] = str(utils.pick_gpu_lowest_memory()) \
        if cmd_args.device is None else cmd_args.device
    models = [cb.directi.DIRECTi.load(model) for model in cmd_args.models]

    cb.message.info("Building Cell BLAST index...")
    blast = cb.blast.BLAST(models, dataset, n_posterior=cmd_args.n_posterior)

    cb.message.info("Saving index...")
    blast.save(cmd_args.output_path)

    cb.message.info("Done!")
Example #9
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset.exprs = cb.utils.densify(dataset.exprs)
    if cmd_args.batch_effect:
        batch_id = [
            cb.utils.encode_integer(
                dataset.obs[batch_effect].astype(object).fillna("NA"))[0]
            for batch_effect in cmd_args.batch_effect
        ]
        n_batch = [np.unique(item).size for item in batch_id]
        batch_id = np.stack(batch_id, axis=1)
        if len(cmd_args.batch_effect) == 1:
            n_batch = n_batch[0]
            batch_id = batch_id[:, 0]
    else:
        n_batch = 0
        batch_id = np.zeros(dataset.shape[0]) * -1

    start_time = time.time()
    model = scphere.model.vae.SCPHERE(n_gene=dataset.shape[1],
                                      z_dim=cmd_args.dim,
                                      latent_dist="vmf",
                                      observation_dist="nb",
                                      seed=cmd_args.seed,
                                      n_batch=n_batch)
    trainer = scphere.util.trainer.Trainer(x=dataset.exprs,
                                           model=model,
                                           mb_size=128,
                                           learning_rate=0.001,
                                           max_epoch=250,
                                           batch_id=batch_id)
    trainer.train()

    latent = model.encode(dataset.exprs, batch_id)
    cb.data.write_hybrid_path(time.time() - start_time,
                              "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))

    model.save_sess(os.path.join(cmd_args.output_path, "model"))
Example #10
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(
        cmd_args.input, sparsify=True
    ).normalize()
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset = dataset.exprs.log1p().toarray()
    start_time = time.time()
    model = DeepImpute.train(
        dataset, cmd_args.n_latent,
        max_epoch=cmd_args.n_epochs, random_seed=cmd_args.seed
    )
    latent, _imputed_val, _batch_effect = DeepImpute.predict(dataset, model)
    cb.data.write_hybrid_path(
        time.time() - start_time,
        "//".join([cmd_args.output, "time"])
    )
    cb.data.write_hybrid_path(
        latent,
        "//".join([cmd_args.output, "latent"])
    )
Example #11
def main(cmd_args):

    cb.message.info("Reading data...")
    genes = np.loadtxt(os.path.join(cmd_args.model, "genes.txt"), dtype=str)
    ref = cb.data.ExprDataSet.read_dataset(cmd_args.ref)
    ref = utils.clean_dataset(
        ref,
        cmd_args.clean).to_anndata() if cmd_args.clean else ref.to_anndata()
    ref = ref[np.random.RandomState(cmd_args.seed).
              choice(ref.shape[0], cmd_args.subsample_ref, replace=False
                     ), :] if cmd_args.subsample_ref is not None else ref
    ref_label = ref.obs[cmd_args.annotation].values
    ref = dca_modpp.io.normalize(ref,
                                 genes,
                                 filter_min_counts=False,
                                 size_factors=10000,
                                 normalize_input=False,
                                 logtrans_input=True)
    cb.message.info("Loading model...")
    os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory() \
        if cmd_args.device is None else cmd_args.device
    model = keras.models.load_model(os.path.join(cmd_args.model, "model.h5"))

    cb.message.info("Projecting to latent space...")
    ref_latent = model.predict({
        "count": ref.X,
        "size_factors": ref.obs.size_factors
    })
    nn = sklearn.neighbors.NearestNeighbors().fit(ref_latent)

    cb.message.info("Building empirical distribution...")
    np.random.seed(cmd_args.seed)
    idx1 = np.random.choice(ref_latent.shape[0], size=N_EMPIRICAL)
    idx2 = np.random.choice(ref_latent.shape[0], size=N_EMPIRICAL)
    empirical = np.sort(
        np.sqrt(np.sum(np.square(ref_latent[idx1] - ref_latent[idx2]),
                       axis=1)))

    cb.message.info("Querying...")
    query = cb.data.ExprDataSet.read_dataset(cmd_args.query)
    query = query[:, np.union1d(query.var_names, genes)]
    query = utils.clean_dataset(
        query,
        cmd_args.clean).to_anndata() if cmd_args.clean else query.to_anndata()
    start_time = time.time()
    query = dca_modpp.io.normalize(query,
                                   genes,
                                   filter_min_counts=False,
                                   size_factors=10000,
                                   normalize_input=False,
                                   logtrans_input=True)
    query_latent = model.predict({
        "count": query.X,
        "size_factors": query.obs.size_factors
    })
    nnd, nni = nn.kneighbors(query_latent, n_neighbors=cmd_args.n_neighbors)
    pval = np.empty_like(nnd, np.float32)
    time_per_cell = None
    prediction_dict = collections.defaultdict(list)

    for cutoff in cmd_args.cutoff:
        for i in range(nnd.shape[0]):
            for j in range(nnd.shape[1]):
                pval[i, j] = np.searchsorted(empirical,
                                             nnd[i, j]) / empirical.size
            uni, count = np.unique(ref_label[nni[i][pval[i] < cutoff]],
                                   return_counts=True)
            total_count = count.sum()
            if total_count < cmd_args.min_hits:
                prediction_dict[cutoff].append("rejected")
                continue
            argmax = np.argmax(count)
            if count[argmax] / total_count <= MAJORITY_THRESHOLD:
                prediction_dict[cutoff].append("ambiguous")
                continue
            prediction_dict[cutoff].append(uni[argmax])
        prediction_dict[cutoff] = np.array(prediction_dict[cutoff])
        if time_per_cell is None:
            time_per_cell = (time.time() - start_time) * 1000 / len(
                prediction_dict[cutoff])
    print("Time per cell: %.3fms" % time_per_cell)

    cb.message.info("Saving results...")
    if os.path.exists(cmd_args.output):
        os.remove(cmd_args.output)
    for cutoff in prediction_dict:
        cb.data.write_hybrid_path(
            prediction_dict[cutoff],
            "%s//prediction/%s" % (cmd_args.output, str(cutoff)))
    cb.data.write_hybrid_path(nni, "//".join((cmd_args.output, "nni")))
    cb.data.write_hybrid_path(nnd, "//".join((cmd_args.output, "nnd")))
    cb.data.write_hybrid_path(pval, "//".join((cmd_args.output, "pval")))
    cb.data.write_hybrid_path(time_per_cell, "//".join(
        (cmd_args.output, "time")))
Example #12
    config = logging_confdict(working_folder, __name__ + "_cleanup")
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__ + "_cleanup")

    # Read in stage_1 raw file
    try:
        stage_1_raw = pd.read_json(working_folder + "/stage_1_raw.json")
    except Exception:
        arxiv_logger.exception("Could not load stage_1_raw file. Exiting...")
        sys.exit("Could not load stage_1_raw file")
    else:
        arxiv_logger.info("Stage_1_raw successfully loaded")

    if not remove_columns:
        remove_columns = eval(Config.get('data_settings', 'remove_cols'))
    stage_1 = clean_dataset(stage_1_raw, arxiv_logger, earliest_date,
                            latest_date, remove_columns)

    stage_1['submitted'] = pd.to_datetime(stage_1['submitted'], unit="ms")
    arxiv_ids = []
    for original_arxiv in stage_1['id'].values:
        found_regex = regex_new_arxiv.findall(original_arxiv)
        if found_regex:
            arxiv_id = found_regex[0]
        else:
            found_regex = regex_old_arxiv.findall(original_arxiv)
            if found_regex:
                arxiv_id = found_regex[0]
            else:
                arxiv_id = "parse_failed"
        arxiv_ids.append(arxiv_id)
    stage_1['arxiv_id'] = pd.Series(arxiv_ids, index=stage_1.index)
Example #13
# by caozj
# Jan 23, 2020
# 11:40:37 AM

import argparse
import numpy as np
import Cell_BLAST as cb
import utils

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", dest="input", type=str, required=True)
parser.add_argument("-o", "--output", dest="output", type=str, required=True)
parser.add_argument("-g", "--genes", dest="genes", type=str, default=None)
parser.add_argument("--clean", dest="clean", type=str, default=None)
cmd_args = parser.parse_args()

# Read data
print("Reading data...")
x = cb.data.ExprDataSet.read_dataset(cmd_args.input).normalize()
if cmd_args.clean:
    x = utils.clean_dataset(x, cmd_args.clean)
if cmd_args.genes is not None:
    x = cb.utils.densify(np.log1p(x[:, x.uns[cmd_args.genes]].exprs))

# Save result
cb.data.write_hybrid_path(x, "%s//exprs" % cmd_args.output)
cb.data.write_hybrid_path(0, "%s//time" % cmd_args.output)

print("Done!")
Example #14
def main(cmd_args):
    cb.message.info("Reading data...")
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input)
    if not cmd_args.no_normalize:
        dataset = dataset.normalize()
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)

    if cmd_args.supervision is not None and cmd_args.label_fraction is not None:
        label = dataset.obs[cmd_args.supervision]
        if cmd_args.label_priority is not None:
            label_priority = dataset.obs[cmd_args.label_priority].values
        else:
            _label_priority = np.random.uniform(size=label.shape[0])
            label_priority = np.empty(len(_label_priority))
            for l in np.unique(label):  # Group percentile
                mask = label == l
                label_priority[mask] = (scipy.stats.rankdata(
                    _label_priority[mask]) - 1) / (mask.sum() - 1)
        exclude_mask = label_priority < np.percentile(
            label_priority, (1 - cmd_args.label_fraction) * 100)
        dataset.obs.loc[exclude_mask, cmd_args.supervision] = np.nan

    latent_module_kwargs = dict(lambda_reg=cmd_args.lambda_prior_reg)
    if cmd_args.supervision is not None:
        latent_module_kwargs["lambda_sup"] = cmd_args.lambda_sup
    prob_module_kwargs = dict(lambda_reg=cmd_args.lambda_prob_reg)
    rmbatch_module_kwargs = dict(lambda_reg=cmd_args.lambda_rmbatch_reg)

    os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory() \
        if cmd_args.device is None else cmd_args.device
    start_time = time.time()
    model = cb.directi.fit_DIRECTi(
        dataset,
        genes=None if cmd_args.genes is None else dataset.uns[cmd_args.genes],
        latent_dim=cmd_args.latent_dim,
        cat_dim=cmd_args.cat_dim,
        supervision=cmd_args.supervision,
        batch_effect=cmd_args.batch_effect,
        h_dim=cmd_args.h_dim,
        depth=cmd_args.depth,
        prob_module=cmd_args.prob_module,
        rmbatch_module=cmd_args.rmbatch_module,
        latent_module_kwargs=latent_module_kwargs,
        prob_module_kwargs=prob_module_kwargs,
        rmbatch_module_kwargs=rmbatch_module_kwargs,
        optimizer=cmd_args.optimizer,
        learning_rate=cmd_args.learning_rate,
        batch_size=cmd_args.batch_size,
        val_split=cmd_args.val_split,
        epoch=cmd_args.epoch,
        patience=cmd_args.patience,
        progress_bar=True,
        random_seed=cmd_args.seed,
        path=cmd_args.output_path)
    model.save()

    cb.message.info("Saving results...")
    inferred_latent = model.inference(dataset)
    cb.data.write_hybrid_path(time.time() - start_time,
                              "%s//time" % cmd_args.output)
    if "exclude_mask" in globals():
        cb.data.write_hybrid_path(~exclude_mask,
                                  "%s//supervision" % cmd_args.output)
    cb.data.write_hybrid_path(inferred_latent, "%s//latent" % cmd_args.output)
    try:  # If intrinsic clustering is used
        cb.data.write_hybrid_path(
            model.clustering(dataset)[0], "%s//cluster" % cmd_args.output)
    except Exception:
        pass
Example #15
observe_dif_times = args.observe_loss_sequence_length
terminate_threshold = args.terminate_threshold


np.random.seed(2019)


if __name__ == '__main__':
    dataset_train, dataset_test = handout.get_text_classification_datasets()
    categories = dataset_train.target_names
     
    # training data and labels
    training_data = (dataset_train.data)
    training_labels = np.array((dataset_train.target))
    
    clean_training_data = utils.clean_dataset(training_data)
    mapping_dict = utils.build_mapping_dict(clean_training_data)
    feature_vector = utils.data2vec(clean_training_data, mapping_dict)
    print(len(feature_vector[0]))
    
    # build model
    softmax_model = model.Softmax_CrossEntropy_model(class_num=len(categories),
                                                     feature_length=feature_vector.shape[1],
                                                     learning_rate=learning_rate,
                                                     regularization_rate=regularization_rate)
    present_epoch = 0
    example_num = len(feature_vector)
    step = 0
    
    # initial auto_terminate
    import queue
Example #16
def main():
	parser = argparse.ArgumentParser(description = 'BIDAF')
	parser.add_argument('file', type = str, help = 'the test file')
	parser.add_argument('--question_maxlen', default = 25, type = int)
	parser.add_argument('--context_maxlen', default = 400, type = int)
	parser.add_argument('--word_maxlen', default = 15, type = int)
	parser.add_argument('--batch_size', default = 10, type = int)
	parser.add_argument('--word_tokenizer', default = 'utils/tokenizers/word_tokenizer.pkl', type = str, help = 'path to the word_tokenizer')
	parser.add_argument('--char_tokenizer', default = 'utils/tokenizers/char_tokenizer.pkl', type = str, help = 'path to the char_tokenizer')
	parser.add_argument('--output_file', default = 'predictions.json', type = str, help = 'path to the output file')
	parser.add_argument('--weights', default = 'utils/models/weights/bidaf_weights', type = str, help = 'path to the weights')
	parser.add_argument('--embedding_size', default = 300, type = int)
	parser.add_argument('--embedding_matrix', default = 'utils/data/embedding.npy', type = str, help = 'path to the embedding matrix npy file')
	parser.add_argument('--learning_rate', default = 0.0005, type = float)
	parser.add_argument('--filter_size', default = 3, type = int)
	parser.add_argument('--char_embedding_size', default = 8, type = int)
	parser.add_argument('--epochs', default = 10, type = int)
	args = parser.parse_args()

	QUESTION_MAXLEN = args.question_maxlen
	CONTEXT_MAXLEN = args.context_maxlen
	WORD_MAXLEN = args.word_maxlen
	BATCH_SIZE = args.batch_size
	LR = args.learning_rate
	EMBEDDING_SIZE = args.embedding_size
	N_FILTERS = EMBEDDING_SIZE
	CHAR_EMBEDDING_SIZE = args.char_embedding_size
	EPOCHS = args.epochs
	FILTER_SIZE = args.filter_size

	curr = os.getcwd()
	
	filepath = os.path.join(curr, args.file)
	output_path = os.path.join(curr, args.output_file)
	word_tokenizer_path = os.path.join(curr, args.word_tokenizer)
	char_tokenizer_path = os.path.join(curr, args.char_tokenizer)
	weights_path = os.path.join(curr, args.weights)
	embedding_matrix_path = os.path.join(curr, args.embedding_matrix)

	with open(word_tokenizer_path, 'rb') as word_handle:
		word_tokenizer = pickle.load(word_handle)

	with open(char_tokenizer_path, 'rb') as char_handle:
		char_tokenizer = pickle.load(char_handle)

	embedding_matrix = np.load(embedding_matrix_path)

	WORD_VOCAB_LEN = len(word_tokenizer.word_index) + 1
	CHAR_VOCAB_LEN = char_tokenizer.num_words

	dataset = load_dataset(filepath, with_answer = False)
	SAMPLES = dataset.shape[0]

	print('[INFO] cleaning data...')
	dataset = clean_dataset(dataset, with_answer = False)
	print('[INFO] done !')
	print('[INFO] tokenizing data...')
	dataset = tokenize(dataset, word_tokenizer, char_tokenizer)
	print('[INFO] done !')
	dataset = dataset[(dataset['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (dataset['tokenized_context'].str.len() <= CONTEXT_MAXLEN)].reset_index(drop = True) 
	print(f'[PREPROCESSING] we get rid of : {SAMPLES - dataset.shape[0]} samples')

	dataset = SQUAD_dataset(dataset, 
		batch_size = BATCH_SIZE, 
		question_maxlen = QUESTION_MAXLEN,
		context_maxlen = CONTEXT_MAXLEN, 
		word_maxlen = WORD_MAXLEN, 
		with_answer = False)

	bidaf_model = BIDAF(
		QUESTION_MAXLEN,
		CONTEXT_MAXLEN,
		WORD_VOCAB_LEN,
		EMBEDDING_SIZE,
		embedding_matrix,
		CHAR_VOCAB_LEN,
		WORD_MAXLEN,
		N_FILTERS,
		FILTER_SIZE,
		CHAR_EMBEDDING_SIZE,
		word_tokenizer_path,
		char_tokenizer_path)

	bidaf_model.load_weights(weights_path)
	print('[INFO] making predictions...')
	bidaf_model.multi_predictions([dataset], output_path)
Example #17
    config = logging_confdict(working_folder, __name__ + "_cleanup")
    logging.config.dictConfig(config)
    cr_logger = logging.getLogger(__name__ + "_cleanup")

    # Read in stage_2 raw file
    try:
        stage_2_raw = pd.read_json(working_folder + "/stage_2_raw.json")
    except Exception:
        cr_logger.exception("Could not load stage_2_raw file")
        sys.exit("Could not load stage 2 raw")
    else:
        cr_logger.info("Stage_2_raw successfully loaded")

    if not remove_columns:
        remove_columns = eval(Config.get('data_settings', 'remove_cols'))
    stage_2 = clean_dataset(stage_2_raw, cr_logger, earliest_date, latest_date,
                            remove_columns)

    cr_unique_dois = stage_2.cr_doi.unique()
    arxiv_unique_dois = stage_2.doi.unique()
    common = set(cr_unique_dois) & set(arxiv_unique_dois)

    cr_logger.info("cr:{}, arxiv:{}, common:{}".format(len(cr_unique_dois),
                                                       len(arxiv_unique_dois),
                                                       len(common)))

    stage_2_no_nan = stage_2[[elem is not np.nan for elem in stage_2.cr_doi]]
    multiple_dois_bool = stage_2_no_nan.cr_doi.duplicated()
    multiple_dois = stage_2_no_nan[multiple_dois_bool].cr_doi

    bad_indices = []
    good_indices = []
Example #18
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    if cmd_args.batch_effect is not None:
        batch_indices = sklearn.preprocessing.LabelEncoder().fit_transform(
            dataset.obs[cmd_args.batch_effect])
    if cmd_args.supervision is not None:
        labels = sklearn.preprocessing.LabelEncoder().fit_transform(
            dataset.obs[cmd_args.supervision])
        if cmd_args.label_fraction is not None:
            if cmd_args.label_priority is not None:
                label_priority = dataset.obs[cmd_args.label_priority]
            else:
                _label_priority = np.random.uniform(size=labels.size)
                label_priority = np.empty(len(_label_priority))
                for l in np.unique(labels):  # Group percentile
                    mask = labels == l
                    label_priority[mask] = (
                        scipy.stats.rankdata(_label_priority[mask]) - 1
                    ) / (mask.sum() - 1)
            if cmd_args.label_fraction == 1.0:
                # Remove a small number of labelled cells to avoid empty
                # unlabelled set, which will lead to a crash.
                cmd_args.label_fraction = 0.99
            labelled_indices = np.where(label_priority >= np.percentile(
                label_priority, (1 - cmd_args.label_fraction) * 100
            ))[0]
        else:
            labelled_indices = np.arange(labels.size)
    dataset.to_anndata().write_h5ad(os.path.join(cmd_args.output_path, "data.h5ad"))
    dataset = scvi.dataset.AnnDataset("data.h5ad", save_path=cmd_args.output_path + "/")

    start_time = time.time()
    model_kwargs = dict(
        n_latent=cmd_args.n_latent,
        n_hidden=cmd_args.n_hidden,
        n_layers=cmd_args.n_layers
    )
    trainer_kwargs = dict(
        use_cuda=True, metrics_to_monitor=["ll"], frequency=5,
        early_stopping_kwargs=dict(
            early_stopping_metric="ll", save_best_state_metric="ll",
            patience=cmd_args.patience, threshold=0
        )
    )
    if cmd_args.batch_effect is not None:
        dataset.batch_indices, dataset.n_batches = \
            batch_indices.reshape((-1, 1)), np.unique(batch_indices).size
        model_kwargs["n_batch"] = dataset.n_batches
    if cmd_args.supervision is not None:
        print("Using SCANVI...")
        dataset.labels, dataset.n_labels = \
            labels.reshape((-1, 1)), np.unique(labels).size
        vae = scvi.models.SCANVI(
            dataset.nb_genes, n_labels=dataset.n_labels, **model_kwargs)
        # trainer_kwargs["early_stopping_kwargs"]["on"] = "unlabelled_set"
        trainer = scvi.inference.annotation.CustomSemiSupervisedTrainer(
            vae, dataset, labelled_indices, **trainer_kwargs)
    else:
        print("Using VAE...")
        vae = scvi.models.VAE(dataset.nb_genes, **model_kwargs)
        trainer = scvi.inference.UnsupervisedTrainer(
            vae, dataset, **trainer_kwargs)
    trainer.train(n_epochs=cmd_args.n_epochs, lr=cmd_args.lr)
    cb.data.write_hybrid_path(
        time.time() - start_time,
        "//".join([cmd_args.output, "time"])
    )
    latent = trainer.get_all_latent_and_imputed_values()["latent"]
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))

    torch.save(vae, os.path.join(cmd_args.output_path, "model.pickle"))
    os.remove(os.path.join(cmd_args.output_path, "data.h5ad"))
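A closing note on the recurring clean_dataset helper: it is not one function. Examples #12 and #17 call a pandas-level cleaner taking a logger, date bounds and columns to drop, Examples #15 and #16 use their own project-specific versions, and the Cell_BLAST benchmark scripts call utils.clean_dataset(dataset, cmd_args.clean) with only an annotation column. For that last form, a rough, purely illustrative stub is sketched below; the real helper in the benchmark repository may filter differently.

def clean_dataset(dataset, obs_col):
    # Hypothetical sketch only: keep cells whose annotation in `obs_col`
    # is present and informative. The actual utils.clean_dataset may
    # apply different or additional filtering.
    labels = dataset.obs[obs_col].astype(object)
    keep = ~(labels.isnull() | labels.isin(["", "na", "NA", "ambiguous", "rejected"]))
    return dataset[keep, :]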