def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(
        cmd_args.input, sparsify=True
    ).normalize(target=100000)  # Example data seem to be normalized to 100,000
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset = np.log2(dataset.exprs.toarray() + 1)
    mat_file = os.path.join(cmd_args.output_path, "matrix.txt.gz")
    res_file = os.path.join(cmd_args.output_path, "output_datafile")
    np.savetxt(mat_file, dataset)
    start_time = time.time()
    Dhaka.Dhaka(
        mat_file, latent_dim=cmd_args.n_latent, N_starts=1,
        epochs=cmd_args.n_epochs, output_datafile=res_file,
        to_cluster=0, gene_selection=0, to_plot=0, relative_expression=0)
    cb.data.write_hybrid_path(
        time.time() - start_time, "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(
        np.loadtxt(res_file + ".txt"), "//".join([cmd_args.output, "latent"]))
    os.remove(mat_file)
    os.remove(res_file + ".txt")
def main(cmd_args):
    cb.utils.logger.info("Reading data...")
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input)
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    model = cb.directi.DIRECTi.load(cmd_args.model)
    data_dict = {
        "exprs": dataset[:, model.genes].exprs,
        "library_size": np.array(dataset.exprs.sum(axis=1)).reshape((-1, 1))
    }
    start_time = time.time()
    if cmd_args.target == "zeros":
        data_dict[cmd_args.batch_effect] = np.zeros(
            (dataset.shape[0], np.unique(dataset.obs[cmd_args.batch_effect]).size))
    elif cmd_args.target == "first":
        data_dict[cmd_args.batch_effect] = cb.utils.encode_onehot(
            dataset.obs["dataset_name"].astype(object).fillna("IgNoRe"),
            sort=True, ignore="IgNoRe"
        ).toarray()
        data_dict[cmd_args.batch_effect][:, 0] = 1.0
        data_dict[cmd_args.batch_effect][:, 1:] = 0.0
    else:  # cmd_args.target == "ones"
        data_dict[cmd_args.batch_effect] = np.ones(
            (dataset.shape[0], np.unique(dataset.obs[cmd_args.batch_effect]).size))
    corrected = model._fetch(
        model.prob_module.softmax_mu, cb.utils.DataDict(data_dict))
    cb.data.write_hybrid_path(time.time() - start_time, f"{cmd_args.output}//time")
    cb.data.write_hybrid_path(corrected, f"{cmd_args.output}//exprs")
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        genes = dataset.uns[cmd_args.genes]
    else:
        genes = None
    dataset = dataset.to_anndata()
    start_time = time.time()
    dataset, model = dca_modpp.api.dca(
        dataset, genes, mode="latent",
        normalize_per_cell=10000, scale=False,
        hidden_size=(cmd_args.n_hidden, ) * cmd_args.n_layers +
                    (cmd_args.n_latent, ) +
                    (cmd_args.n_hidden, ) * cmd_args.n_layers,
        epochs=cmd_args.n_epochs, early_stop=cmd_args.patience,
        random_state=cmd_args.seed, threads=cmd_args.threads,
        return_model=True, copy=True)
    cb.data.write_hybrid_path(
        time.time() - start_time, "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(
        dataset.obsm["X_dca"], "//".join([cmd_args.output, "latent"]))
    model.encoder.save(os.path.join(cmd_args.output_path, "model.h5"))
    np.savetxt(os.path.join(cmd_args.output_path, "genes.txt"), genes, "%s")
def main(cmd_args):
    cb.message.info("Loading index...")
    os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory() \
        if cmd_args.device is None else cmd_args.device
    blast = cb.blast.BLAST.load(cmd_args.index)
    if cmd_args.subsample_ref is not None:
        cb.message.info("Subsampling reference...")
        subsample_idx = np.random.RandomState(cmd_args.seed).choice(
            blast.ref.shape[0], cmd_args.subsample_ref, replace=False)
        blast.ref = blast.ref[subsample_idx, :]
        blast.latent = blast.latent[subsample_idx] \
            if blast.latent is not None else None
        blast.cluster = blast.cluster[subsample_idx] \
            if blast.cluster is not None else None
        blast.posterior = blast.posterior[subsample_idx] \
            if blast.posterior is not None else None
        blast.nearest_neighbors = None
        blast.empirical = None
        blast._force_components()
    cb.message.info("Reading query...")
    query = cb.data.ExprDataSet.read_dataset(cmd_args.query)
    if cmd_args.clean:
        query = utils.clean_dataset(query, cmd_args.clean)
    if cmd_args.align:
        cb.message.info("Aligning...")
        unipath = "/tmp/cb/" + cb.utils.rand_hex()
        cb.message.info("Using temporary path: " + unipath)
        blast = blast.align(query, path=unipath)
    cb.message.info("BLASTing...")
    start_time = time.time()
    hits = blast.query(query, n_neighbors=cmd_args.n_neighbors).reconcile_models()
    time_per_cell = None
    prediction_dict = {}
    for cutoff in cmd_args.cutoff:
        prediction_dict[cutoff] = hits.filter(
            by=cmd_args.filter_by, cutoff=cutoff
        ).annotate(
            cmd_args.annotation, min_hits=cmd_args.min_hits
        )[cmd_args.annotation]
        if time_per_cell is None:
            time_per_cell = (time.time() - start_time) * 1000 / len(
                prediction_dict[cutoff])
            print("Time per cell: %.3fms" % time_per_cell)
    cb.message.info("Saving result...")
    if os.path.exists(cmd_args.output):
        os.remove(cmd_args.output)
    for cutoff in prediction_dict:
        cb.data.write_hybrid_path(
            prediction_dict[cutoff],
            "%s//prediction/%s" % (cmd_args.output, str(cutoff)))
    cb.data.write_hybrid_path(time_per_cell, "//".join(
        (cmd_args.output, "time")))
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    dataset = dataset.normalize()
    dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset.exprs = np.log1p(dataset.exprs)
    if cmd_args.batch_effect is not None:
        batches = np.unique(dataset.obs[cmd_args.batch_effect])
        for batch in batches:
            dataset[dataset.obs[cmd_args.batch_effect] == batch, :].write_table(
                os.path.join(cmd_args.output_path, "input", f"{batch}.csv"),
                index=False)
    else:
        dataset.write_table(
            os.path.join(cmd_args.output_path, "input", "data.csv"), index=False)
    call_args = [
        "python", os.path.join(SAUCIE.__path__[0], "SAUCIE.py"),
        "--input_dir", os.path.join(cmd_args.output_path, "input"),
        "--output_dir", os.path.join(cmd_args.output_path, "output"),
        "--seed", str(cmd_args.seed), "--cluster"
    ]
    if cmd_args.batch_effect is not None:
        call_args.append("--batch_correct")
    start_time = time.time()
    print(f"Running command: {' '.join(call_args)}")
    subprocess.check_call(call_args)
    cb.data.write_hybrid_path(
        time.time() - start_time, "//".join([cmd_args.output, "time"]))
    if cmd_args.batch_effect is not None:
        latent = np.empty((dataset.shape[0], 2))
        for batch in batches:
            idx = np.where(dataset.obs[cmd_args.batch_effect] == batch)[0]
            latent[idx, :] = pd.read_csv(
                os.path.join(cmd_args.output_path, "output", "clustered", f"{batch}.csv")
            ).loc[:, ["Embedding_SAUCIE1", "Embedding_SAUCIE2"]].to_numpy()
    else:
        latent = pd.read_csv(
            os.path.join(cmd_args.output_path, "output", "clustered", "data.csv")
        ).loc[:, ["Embedding_SAUCIE1", "Embedding_SAUCIE2"]].to_numpy()
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))
def main():
    x = cb.data.read_hybrid_path("//".join(
        [snakemake.input.x, "visualization"]))
    ds = cb.data.ExprDataSet.read_dataset(snakemake.input.data)
    ds = utils.clean_dataset(ds, snakemake.config["label"])
    axis1 = "{vis}1".format(vis=snakemake.wildcards.vis)
    axis2 = "{vis}2".format(vis=snakemake.wildcards.vis)
    label = snakemake.wildcards.label.replace("_", " ").capitalize()
    df = pd.DataFrame({
        axis1: x[:, 0],
        axis2: x[:, 1],
        label: pd.Categorical(
            ds.obs[snakemake.wildcards.label].values,
            categories=sorted(
                np.unique(ds.obs[snakemake.wildcards.label].values).tolist(),
                key=lambda x: x.lower()))
    })
    if snakemake.params["shuffle"]:
        df = df.sample(frac=1)
    fig, ax = plt.subplots(figsize=(
        snakemake.params["width"], snakemake.params["height"]))
    ax = sns.scatterplot(
        x=axis1, y=axis2, hue=label, data=df,
        s=snakemake.params["psize"], edgecolor=None,
        rasterized=snakemake.params["rasterized"], ax=ax)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")
    ax.legend(
        bbox_to_anchor=(1.05, 0.5), loc="center left", borderaxespad=0.0,
        frameon=False, prop=dict(size=snakemake.params["legend_size"]),
        markerscale=snakemake.params["marker_scale"],
        labelspacing=snakemake.params["label_spacing"],
        ncol=np.ceil(np.unique(df[label]).size / 50).astype(int))  # np.int is deprecated; use the builtin
    fig.savefig(snakemake.output[0], dpi=300, bbox_inches="tight")
def main(cmd_args):
    print("Loading model...")
    with open(os.path.join(cmd_args.model, "label_encoder.pickle"), "rb") as f:
        label_encoder = pickle.load(f)
    genes = np.loadtxt(os.path.join(cmd_args.model, "genes.txt"), dtype=str)
    vae = torch.load(os.path.join(cmd_args.model, "model.pickle"))
    print("Loading query...")
    query = cb.data.ExprDataSet.read_dataset(cmd_args.query, sparsify=True)
    if cmd_args.clean is not None:
        query = utils.clean_dataset(query, cmd_args.clean)
    n_cells = query.shape[0]
    if cmd_args.normalize:
        query = query.normalize()
    query = query[:, genes]
    query.to_anndata().write_h5ad(
        os.path.join(cmd_args.output_path, "query.h5ad"))
    query = scvi.dataset.AnnDataset(
        "query.h5ad", save_path=cmd_args.output_path + "/")
    print("Predicting...")
    start_time = time.time()
    trainer = scvi.inference.annotation.CustomSemiSupervisedTrainer(
        vae, query, np.array([]), use_cuda=True, metrics_to_monitor=["ll"])
    prob = get_scanvi_class_posterior(trainer)
    time_per_cell = None
    prediction_dict = collections.defaultdict(
        lambda: np.repeat("rejected", n_cells).astype(object))
    for cutoff in cmd_args.cutoff:
        mask = prob.max(axis=1) > cutoff
        prediction_dict[cutoff][mask] = label_encoder.inverse_transform(
            prob[mask].argmax(axis=1))
        if time_per_cell is None:
            time_per_cell = (time.time() - start_time) * 1000 / n_cells
            print("Time per cell: %.3fms" % time_per_cell)
    print("Saving result...")
    if os.path.exists(cmd_args.output):
        os.remove(cmd_args.output)
    for cutoff, prediction in prediction_dict.items():
        cb.data.write_hybrid_path(
            prediction, "%s//prediction/%s" % (cmd_args.output, str(cutoff)))
    cb.data.write_hybrid_path(time_per_cell, "//".join(
        (cmd_args.output, "time")))
def main(cmd_args):
    cb.message.info("Reading data...")
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.ref)
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(utils.pick_gpu_lowest_memory()) \
        if cmd_args.device is None else cmd_args.device
    models = [cb.directi.DIRECTi.load(model) for model in cmd_args.models]
    cb.message.info("Building Cell BLAST index...")
    blast = cb.blast.BLAST(models, dataset, n_posterior=cmd_args.n_posterior)
    cb.message.info("Saving index...")
    blast.save(cmd_args.output_path)
    cb.message.info("Done!")
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset.exprs = cb.utils.densify(dataset.exprs)
    if cmd_args.batch_effect:
        batch_id = [
            cb.utils.encode_integer(
                dataset.obs[batch_effect].astype(object).fillna("NA"))[0]
            for batch_effect in cmd_args.batch_effect
        ]
        n_batch = [np.unique(item).size for item in batch_id]
        batch_id = np.stack(batch_id, axis=1)
        if len(cmd_args.batch_effect) == 1:
            n_batch = n_batch[0]
            batch_id = batch_id[:, 0]
    else:
        n_batch = 0
        batch_id = np.zeros(dataset.shape[0]) * -1
    start_time = time.time()
    model = scphere.model.vae.SCPHERE(
        n_gene=dataset.shape[1], z_dim=cmd_args.dim,
        latent_dist="vmf", observation_dist="nb",
        seed=cmd_args.seed, n_batch=n_batch)
    trainer = scphere.util.trainer.Trainer(
        x=dataset.exprs, model=model, mb_size=128,
        learning_rate=0.001, max_epoch=250, batch_id=batch_id)
    trainer.train()
    latent = model.encode(dataset.exprs, batch_id)
    cb.data.write_hybrid_path(
        time.time() - start_time, "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))
    model.save_sess(os.path.join(cmd_args.output_path, "model"))
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(
        cmd_args.input, sparsify=True
    ).normalize()
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset = dataset.exprs.log1p().toarray()
    start_time = time.time()
    model = DeepImpute.train(
        dataset, cmd_args.n_latent,
        max_epoch=cmd_args.n_epochs, random_seed=cmd_args.seed)
    latent, _imputed_val, _batch_effect = DeepImpute.predict(dataset, model)
    cb.data.write_hybrid_path(
        time.time() - start_time, "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(
        latent, "//".join([cmd_args.output, "latent"]))
def main(cmd_args):
    cb.message.info("Reading data...")
    genes = np.loadtxt(os.path.join(cmd_args.model, "genes.txt"), dtype=str)
    ref = cb.data.ExprDataSet.read_dataset(cmd_args.ref)
    ref = utils.clean_dataset(ref, cmd_args.clean).to_anndata() \
        if cmd_args.clean else ref.to_anndata()
    ref = ref[np.random.RandomState(cmd_args.seed).choice(
        ref.shape[0], cmd_args.subsample_ref, replace=False), :] \
        if cmd_args.subsample_ref is not None else ref
    ref_label = ref.obs[cmd_args.annotation].values
    ref = dca_modpp.io.normalize(
        ref, genes, filter_min_counts=False, size_factors=10000,
        normalize_input=False, logtrans_input=True)
    cb.message.info("Loading model...")
    os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory() \
        if cmd_args.device is None else cmd_args.device
    model = keras.models.load_model(os.path.join(cmd_args.model, "model.h5"))
    cb.message.info("Projecting to latent space...")
    ref_latent = model.predict({
        "count": ref.X, "size_factors": ref.obs.size_factors})
    nn = sklearn.neighbors.NearestNeighbors().fit(ref_latent)
    cb.message.info("Building empirical distribution...")
    np.random.seed(cmd_args.seed)
    idx1 = np.random.choice(ref_latent.shape[0], size=N_EMPIRICAL)
    idx2 = np.random.choice(ref_latent.shape[0], size=N_EMPIRICAL)
    empirical = np.sort(np.sqrt(np.sum(
        np.square(ref_latent[idx1] - ref_latent[idx2]), axis=1)))
    cb.message.info("Querying...")
    query = cb.data.ExprDataSet.read_dataset(cmd_args.query)
    query = query[:, np.union1d(query.var_names, genes)]
    query = utils.clean_dataset(query, cmd_args.clean).to_anndata() \
        if cmd_args.clean else query.to_anndata()
    start_time = time.time()
    query = dca_modpp.io.normalize(
        query, genes, filter_min_counts=False, size_factors=10000,
        normalize_input=False, logtrans_input=True)
    query_latent = model.predict({
        "count": query.X, "size_factors": query.obs.size_factors})
    nnd, nni = nn.kneighbors(query_latent, n_neighbors=cmd_args.n_neighbors)
    pval = np.empty_like(nnd, np.float32)
    time_per_cell = None
    prediction_dict = collections.defaultdict(list)
    for cutoff in cmd_args.cutoff:
        for i in range(nnd.shape[0]):
            for j in range(nnd.shape[1]):
                pval[i, j] = np.searchsorted(empirical, nnd[i, j]) / empirical.size
            uni, count = np.unique(
                ref_label[nni[i][pval[i] < cutoff]], return_counts=True)
            total_count = count.sum()
            if total_count < cmd_args.min_hits:
                prediction_dict[cutoff].append("rejected")
                continue
            argmax = np.argmax(count)
            if count[argmax] / total_count <= MAJORITY_THRESHOLD:
                prediction_dict[cutoff].append("ambiguous")
                continue
            prediction_dict[cutoff].append(uni[argmax])
        prediction_dict[cutoff] = np.array(prediction_dict[cutoff])
        if time_per_cell is None:
            time_per_cell = (time.time() - start_time) * 1000 / len(
                prediction_dict[cutoff])
            print("Time per cell: %.3fms" % time_per_cell)
    cb.message.info("Saving results...")
    if os.path.exists(cmd_args.output):
        os.remove(cmd_args.output)
    for cutoff in prediction_dict:
        cb.data.write_hybrid_path(
            prediction_dict[cutoff],
            "%s//prediction/%s" % (cmd_args.output, str(cutoff)))
    cb.data.write_hybrid_path(nni, "//".join((cmd_args.output, "nni")))
    cb.data.write_hybrid_path(nnd, "//".join((cmd_args.output, "nnd")))
    cb.data.write_hybrid_path(pval, "//".join((cmd_args.output, "pval")))
    cb.data.write_hybrid_path(time_per_cell, "//".join(
        (cmd_args.output, "time")))
config = logging_confdict(working_folder, __name__ + "_cleanup")
logging.config.dictConfig(config)
arxiv_logger = logging.getLogger(__name__ + "_cleanup")

# Read in stage_1 raw file
try:
    stage_1_raw = pd.read_json(working_folder + "/stage_1_raw.json")
except Exception as e:
    arxiv_logger.exception("Could not load stage_1_raw file. Exiting...")
    sys.exit("Could not load stage_1_raw file")
else:
    arxiv_logger.info("Stage_1_raw successfully loaded")

if not remove_columns:
    remove_columns = eval(Config.get('data_settings', 'remove_cols'))

stage_1 = clean_dataset(stage_1_raw, arxiv_logger, earliest_date, latest_date, remove_columns)
stage_1['submitted'] = pd.to_datetime(stage_1['submitted'], unit="ms")

arxiv_ids = []
for original_arxiv in stage_1['id'].values:
    found_regex = regex_new_arxiv.findall(original_arxiv)
    if found_regex:
        arxiv_id = found_regex[0]
    else:
        found_regex = regex_old_arxiv.findall(original_arxiv)
        if found_regex:
            arxiv_id = found_regex[0]
        else:
            arxiv_id = "parse_failed"
    arxiv_ids.append(arxiv_id)

stage_1['arxiv_id'] = pd.Series(arxiv_ids, index=stage_1.index)
# by caozj
# Jan 23, 2020
# 11:40:37 AM

import argparse

import numpy as np

import Cell_BLAST as cb
import utils

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", dest="input", type=str, required=True)
parser.add_argument("-o", "--output", dest="output", type=str, required=True)
parser.add_argument("-g", "--genes", dest="genes", type=str, default=None)
parser.add_argument("--clean", dest="clean", type=str, default=None)
cmd_args = parser.parse_args()

# Read data
print("Reading data...")
x = cb.data.ExprDataSet.read_dataset(cmd_args.input).normalize()
if cmd_args.clean:
    x = utils.clean_dataset(x, cmd_args.clean)
if cmd_args.genes is not None:
    x = cb.utils.densify(np.log1p(x[:, x.uns[cmd_args.genes]].exprs))

# Save result
cb.data.write_hybrid_path(x, "%s//exprs" % cmd_args.output)
cb.data.write_hybrid_path(0, "%s//time" % cmd_args.output)
print("Done!")
def main(cmd_args):
    cb.message.info("Reading data...")
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input)
    if not cmd_args.no_normalize:
        dataset = dataset.normalize()
    if cmd_args.clean:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.supervision is not None and cmd_args.label_fraction is not None:
        label = dataset.obs[cmd_args.supervision]
        if cmd_args.label_priority is not None:
            label_priority = dataset.obs[cmd_args.label_priority].values
        else:
            _label_priority = np.random.uniform(size=label.shape[0])
            label_priority = np.empty(len(_label_priority))
            for l in np.unique(label):  # Group percentile
                mask = label == l
                label_priority[mask] = (scipy.stats.rankdata(
                    _label_priority[mask]) - 1) / (mask.sum() - 1)
        exclude_mask = label_priority < np.percentile(
            label_priority, (1 - cmd_args.label_fraction) * 100)
        dataset.obs.loc[exclude_mask, cmd_args.supervision] = np.nan
    latent_module_kwargs = dict(lambda_reg=cmd_args.lambda_prior_reg)
    if cmd_args.supervision is not None:
        latent_module_kwargs["lambda_sup"] = cmd_args.lambda_sup
    prob_module_kwargs = dict(lambda_reg=cmd_args.lambda_prob_reg)
    rmbatch_module_kwargs = dict(lambda_reg=cmd_args.lambda_rmbatch_reg)
    os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory() \
        if cmd_args.device is None else cmd_args.device
    start_time = time.time()
    model = cb.directi.fit_DIRECTi(
        dataset,
        genes=None if cmd_args.genes is None else dataset.uns[cmd_args.genes],
        latent_dim=cmd_args.latent_dim, cat_dim=cmd_args.cat_dim,
        supervision=cmd_args.supervision, batch_effect=cmd_args.batch_effect,
        h_dim=cmd_args.h_dim, depth=cmd_args.depth,
        prob_module=cmd_args.prob_module, rmbatch_module=cmd_args.rmbatch_module,
        latent_module_kwargs=latent_module_kwargs,
        prob_module_kwargs=prob_module_kwargs,
        rmbatch_module_kwargs=rmbatch_module_kwargs,
        optimizer=cmd_args.optimizer, learning_rate=cmd_args.learning_rate,
        batch_size=cmd_args.batch_size, val_split=cmd_args.val_split,
        epoch=cmd_args.epoch, patience=cmd_args.patience,
        progress_bar=True, random_seed=cmd_args.seed,
        path=cmd_args.output_path)
    model.save()
    cb.message.info("Saving results...")
    inferred_latent = model.inference(dataset)
    cb.data.write_hybrid_path(time.time() - start_time, "%s//time" % cmd_args.output)
    if "exclude_mask" in locals():  # exclude_mask is local to main, so check locals, not globals
        cb.data.write_hybrid_path(~exclude_mask, "%s//supervision" % cmd_args.output)
    cb.data.write_hybrid_path(inferred_latent, "%s//latent" % cmd_args.output)
    try:  # If intrinsic clustering is used
        cb.data.write_hybrid_path(
            model.clustering(dataset)[0], "%s//cluster" % cmd_args.output)
    except Exception:
        pass
observe_dif_times = args.observe_loss_sequence_length
terminate_threshold = args.terminate_threshold

np.random.seed(2019)

if __name__ == '__main__':
    dataset_train, dataset_test = handout.get_text_classification_datasets()
    categories = dataset_train.target_names

    # training data and labels
    training_data = dataset_train.data
    training_labels = np.array(dataset_train.target)
    clean_training_data = utils.clean_dataset(training_data)
    mapping_dict = utils.build_mapping_dict(clean_training_data)
    feature_vector = utils.data2vec(clean_training_data, mapping_dict)
    print(len(feature_vector[0]))

    # build model
    softmax_model = model.Softmax_CrossEntropy_model(
        class_num=len(categories),
        feature_length=feature_vector.shape[1],
        learning_rate=learning_rate,
        regularization_rate=regularization_rate)

    present_epoch = 0
    example_num = len(feature_vector)
    step = 0

    # initial auto_terminate
    import queue
def main():
    parser = argparse.ArgumentParser(description='BIDAF')
    parser.add_argument('file', type=str, help='the test file')
    parser.add_argument('--question_maxlen', default=25, type=int)
    parser.add_argument('--context_maxlen', default=400, type=int)
    parser.add_argument('--word_maxlen', default=15, type=int)
    parser.add_argument('--batch_size', default=10, type=int)
    parser.add_argument('--word_tokenizer', default='utils/tokenizers/word_tokenizer.pkl',
                        type=str, help='path to the word_tokenizer')
    parser.add_argument('--char_tokenizer', default='utils/tokenizers/char_tokenizer.pkl',
                        type=str, help='path to the char_tokenizer')
    parser.add_argument('--output_file', default='predictions.json',
                        type=str, help='path to the output file')
    parser.add_argument('--weights', default='utils/models/weights/bidaf_weights',
                        type=str, help='path to the weights')
    parser.add_argument('--embedding_size', default=300, type=int)
    parser.add_argument('--embedding_matrix', default='utils/data/embedding.npy',
                        type=str, help='path to the embedding matrix npy file')
    parser.add_argument('--learning_rate', default=0.0005, type=float)
    parser.add_argument('--filter_size', default=3, type=int)
    parser.add_argument('--char_embedding_size', default=8, type=int)
    parser.add_argument('--epochs', default=10, type=int)
    args = parser.parse_args()

    QUESTION_MAXLEN = args.question_maxlen
    CONTEXT_MAXLEN = args.context_maxlen
    WORD_MAXLEN = args.word_maxlen
    BATCH_SIZE = args.batch_size
    LR = args.learning_rate
    EMBEDDING_SIZE = args.embedding_size
    N_FILTERS = EMBEDDING_SIZE
    CHAR_EMBEDDING_SIZE = args.char_embedding_size
    EPOCHS = args.epochs
    FILTER_SIZE = args.filter_size

    curr = os.getcwd()
    filepath = os.path.join(curr, args.file)
    output_path = os.path.join(curr, args.output_file)
    word_tokenizer_path = os.path.join(curr, args.word_tokenizer)
    char_tokenizer_path = os.path.join(curr, args.char_tokenizer)
    weights_path = os.path.join(curr, args.weights)
    embedding_matrix_path = os.path.join(curr, args.embedding_matrix)

    with open(word_tokenizer_path, 'rb') as word_handle:
        word_tokenizer = pickle.load(word_handle)
    with open(char_tokenizer_path, 'rb') as char_handle:
        char_tokenizer = pickle.load(char_handle)
    embedding_matrix = np.load(embedding_matrix_path)

    WORD_VOCAB_LEN = len(word_tokenizer.word_index) + 1
    CHAR_VOCAB_LEN = char_tokenizer.num_words

    dataset = load_dataset(filepath, with_answer=False)
    SAMPLES = dataset.shape[0]

    print('[INFO] cleaning data...')
    dataset = clean_dataset(dataset, with_answer=False)
    print('[INFO] done !')

    print('[INFO] tokenizing data...')
    dataset = tokenize(dataset, word_tokenizer, char_tokenizer)
    print('[INFO] done !')

    dataset = dataset[(dataset['tokenized_question'].str.len() <= QUESTION_MAXLEN) &
                      (dataset['tokenized_context'].str.len() <= CONTEXT_MAXLEN)].reset_index(drop=True)
    print(f'[PREPROCESSING] discarded {SAMPLES - dataset.shape[0]} samples')

    dataset = SQUAD_dataset(dataset, batch_size=BATCH_SIZE,
                            question_maxlen=QUESTION_MAXLEN,
                            context_maxlen=CONTEXT_MAXLEN,
                            word_maxlen=WORD_MAXLEN, with_answer=False)

    bidaf_model = BIDAF(
        QUESTION_MAXLEN, CONTEXT_MAXLEN, WORD_VOCAB_LEN, EMBEDDING_SIZE,
        embedding_matrix, CHAR_VOCAB_LEN, WORD_MAXLEN, N_FILTERS, FILTER_SIZE,
        CHAR_EMBEDDING_SIZE, word_tokenizer_path, char_tokenizer_path)
    bidaf_model.load_weights(weights_path)

    print('[INFO] making predictions...')
    bidaf_model.multi_predictions([dataset], output_path)
config = logging_confdict(working_folder, __name__ + "_cleanup")
logging.config.dictConfig(config)
cr_logger = logging.getLogger(__name__ + "_cleanup")

# Read in stage_2 raw file
try:
    stage_2_raw = pd.read_json(working_folder + "/stage_2_raw.json")
except Exception as e:
    cr_logger.exception("Could not load stage_2_raw file")
    sys.exit("Could not load stage 2 raw")
else:
    cr_logger.info("Stage_2_raw successfully loaded")

if not remove_columns:
    remove_columns = eval(Config.get('data_settings', 'remove_cols'))

stage_2 = clean_dataset(stage_2_raw, cr_logger, earliest_date, latest_date, remove_columns)

cr_unique_dois = stage_2.cr_doi.unique()
arxiv_unique_dois = stage_2.doi.unique()
common = set(cr_unique_dois) & set(arxiv_unique_dois)

cr_logger.info("cr:{}, arxiv:{}, common:{}".format(
    len(cr_unique_dois), len(arxiv_unique_dois), len(common)))

stage_2_no_nan = stage_2[[elem is not np.nan for elem in stage_2.cr_doi]]
multiple_dois_bool = stage_2_no_nan.cr_doi.duplicated()
multiple_dois = stage_2_no_nan[multiple_dois_bool].cr_doi

bad_indices = []
good_indices = []
def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(cmd_args.input, sparsify=True)
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    if cmd_args.batch_effect is not None:
        batch_indices = sklearn.preprocessing.LabelEncoder().fit_transform(
            dataset.obs[cmd_args.batch_effect])
    if cmd_args.supervision is not None:
        labels = sklearn.preprocessing.LabelEncoder().fit_transform(
            dataset.obs[cmd_args.supervision])
        if cmd_args.label_fraction is not None:
            if cmd_args.label_priority is not None:
                label_priority = dataset.obs[cmd_args.label_priority]
            else:
                _label_priority = np.random.uniform(size=labels.size)
                label_priority = np.empty(len(_label_priority))
                for l in np.unique(labels):  # Group percentile
                    mask = labels == l
                    label_priority[mask] = (
                        scipy.stats.rankdata(_label_priority[mask]) - 1
                    ) / (mask.sum() - 1)
            if cmd_args.label_fraction == 1.0:
                # Remove a small number of labelled cells to avoid an empty
                # unlabelled set, which would lead to a crash.
                cmd_args.label_fraction = 0.99
            labelled_indices = np.where(label_priority >= np.percentile(
                label_priority, (1 - cmd_args.label_fraction) * 100))[0]
        else:
            labelled_indices = np.arange(labels.size)
    dataset.to_anndata().write_h5ad(os.path.join(cmd_args.output_path, "data.h5ad"))
    dataset = scvi.dataset.AnnDataset("data.h5ad", save_path=cmd_args.output_path + "/")
    start_time = time.time()
    model_kwargs = dict(
        n_latent=cmd_args.n_latent,
        n_hidden=cmd_args.n_hidden,
        n_layers=cmd_args.n_layers
    )
    trainer_kwargs = dict(
        use_cuda=True, metrics_to_monitor=["ll"], frequency=5,
        early_stopping_kwargs=dict(
            early_stopping_metric="ll",
            save_best_state_metric="ll",
            patience=cmd_args.patience, threshold=0
        )
    )
    if cmd_args.batch_effect is not None:
        dataset.batch_indices, dataset.n_batches = \
            batch_indices.reshape((-1, 1)), np.unique(batch_indices).size
        model_kwargs["n_batch"] = dataset.n_batches
    if cmd_args.supervision is not None:
        print("Using SCANVI...")
        dataset.labels, dataset.n_labels = \
            labels.reshape((-1, 1)), np.unique(labels).size
        vae = scvi.models.SCANVI(
            dataset.nb_genes, n_labels=dataset.n_labels, **model_kwargs)
        # trainer_kwargs["early_stopping_kwargs"]["on"] = "unlabelled_set"
        trainer = scvi.inference.annotation.CustomSemiSupervisedTrainer(
            vae, dataset, labelled_indices, **trainer_kwargs)
    else:
        print("Using VAE...")
        vae = scvi.models.VAE(dataset.nb_genes, **model_kwargs)
        trainer = scvi.inference.UnsupervisedTrainer(
            vae, dataset, **trainer_kwargs)
    trainer.train(n_epochs=cmd_args.n_epochs, lr=cmd_args.lr)
    cb.data.write_hybrid_path(
        time.time() - start_time, "//".join([cmd_args.output, "time"]))
    latent = trainer.get_all_latent_and_imputed_values()["latent"]
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))
    torch.save(vae, os.path.join(cmd_args.output_path, "model.pickle"))
    os.remove(os.path.join(cmd_args.output_path, "data.h5ad"))