def main():
    """Tabulate per-batch cell-type composition, one Excel sheet per input file.

    For every data file, counts cells of each label within each batch and
    writes a "count (percentage)" table (labels x batches) to a sheet named
    ``group_<i>`` of ``snakemake.output[0]``.
    """
    # Context manager guarantees the workbook is flushed/closed even on error
    # (ExcelWriter.save() was removed in pandas 2.0).
    with pd.ExcelWriter(snakemake.output[0]) as output:
        for i, data_file in enumerate(snakemake.input["data"]):
            labels = cb.data.read_hybrid_path("{file}//obs/{label}".format(
                file=data_file, label=snakemake.config["label"]))
            batches = cb.data.read_hybrid_path("{file}//obs/{batch}".format(
                file=data_file, batch=snakemake.config["batch"]))
            # Drop cells with missing labels (mask applies to both arrays).
            mask = utils.na_mask(labels)
            labels, batches = labels[~mask], batches[~mask]
            df_list = []
            for batch in np.unique(batches):
                batch_labels = labels[batches == batch]
                uniq, population = np.unique(batch_labels, return_counts=True)
                proportion = population / population.sum()
                df = pd.DataFrame({
                    "population": np.vectorize(str)(population),
                    "proportion": np.vectorize(
                        lambda x: "%.1f%%" % x)(proportion * 100),
                    snakemake.config["label"]: uniq
                })
                # One column per batch, formatted as "count (percentage)".
                df[str(batch)] = df["population"] + " (" + df["proportion"] + ")"
                del df["population"], df["proportion"]
                df_list.append(df)
            # Outer-merge on the label column so every cell type appears;
            # absent (type, batch) combinations become "0 (0.0%)".
            df = functools.reduce(lambda x, y: pd.merge(
                x, y, how="outer", on=snakemake.config["label"]
            ), df_list).fillna("0 (0.0%)")
            sheet_name = "group_%d" % (i + 1)
            df.to_excel(output, sheet_name=sheet_name, index=False)
def main():
    """Evaluate batch-correction performance of one integration result.

    Loads labels and batch assignments from the data file and the latent
    embedding from the result file, computes label- and batch-mixing
    metrics, and dumps them as JSON to ``snakemake.output[0]``.
    """
    label_key = snakemake.config["label"]
    batch_key = snakemake.config["batch"]
    nn = snakemake.config["nn"]

    y = cb.data.read_hybrid_path("//".join(
        [snakemake.input.data, "obs/%s" % label_key]))
    mask = utils.na_mask(y)  # cells with missing labels are excluded everywhere
    y = cb.utils.encode_integer(y[~mask])[0]

    b = cb.data.read_hybrid_path("//".join(
        [snakemake.input.data, "obs/%s" % batch_key]))
    b = cb.utils.encode_integer(b[~mask])[0]

    x = cb.data.read_hybrid_path("//".join(
        [snakemake.input.result, snakemake.params.slot]))

    performance = {
        "nearest_neighbor_accuracy":
            cb.metrics.nearest_neighbor_accuracy(x, y),
        "mean_average_precision":
            cb.metrics.mean_average_precision_from_latent(x, y, k=nn),
        "seurat_alignment_score":
            cb.metrics.seurat_alignment_score(x, b, n=10, k=nn),
        "batch_mixing_entropy":
            cb.metrics.batch_mixing_entropy(x, b),
        # "Null" method stores time = 0, which reads back as np.int64 —
        # cast to float so json.dump can serialize it.
        "time": float(cb.data.read_hybrid_path("//".join(
            [snakemake.input.result, "time"]))),
        "n_cell": x.shape[0],
    }
    with open(snakemake.output[0], "w") as f:
        json.dump(performance, f, indent=4)
def main():
    """Evaluate label-preservation performance of one dimension-reduction result.

    Loads cell-type labels from the data file and the latent embedding from
    the result file, computes label-based metrics, and dumps them as JSON to
    ``snakemake.output[0]``.
    """
    y = cb.data.read_hybrid_path("//".join(
        [snakemake.input.data, "obs/%s" % snakemake.config["label"]]))
    y = y[~utils.na_mask(y)]  # drop cells with missing labels
    y = cb.utils.encode_integer(y)[0]
    x = cb.data.read_hybrid_path("//".join([snakemake.input.result, "latent"]))
    performance = dict(
        nearest_neighbor_accuracy=cb.metrics.nearest_neighbor_accuracy(x, y),
        mean_average_precision=cb.metrics.mean_average_precision_from_latent(
            x, y, k=snakemake.config["nn"]),
        # Cast to float: a stored time of 0 reads back as np.int64, which
        # json.dump cannot serialize (same fix as the batch-aware variant).
        time=float(cb.data.read_hybrid_path("//".join(
            [snakemake.input.result, "time"]))),
        n_cell=x.shape[0])
    with open(snakemake.output[0], "w") as f:
        json.dump(performance, f, indent=4)
def main():
    """Evaluate cell-type prediction quality across rejection thresholds.

    Computes dataset-weighed sensitivity/specificity and mean balanced
    accuracy (MBA) for each prediction threshold. Writes a JSON summary to
    ``snakemake.output[0]`` and a per-threshold cell-type-specific accuracy
    workbook to ``snakemake.output[1]``.
    """
    ref = np.concatenate([
        cb.data.read_hybrid_path("{file}//obs/{label}".format(
            file=item, label=snakemake.config["label"]))
        for item in snakemake.input.ref
    ])
    ref = ref[~utils.na_mask(ref)]
    # Cell types present in the reference count as "positive" types.
    pos_types = np.unique(ref)
    expect = pd.read_csv(snakemake.params.expect, index_col=0)

    # Dataset weighed: each query dataset contributes equally regardless of
    # its cell count; weights are normalized to average 1 per cell.
    true = [
        cb.data.read_hybrid_path("{file}//obs/{label}".format(
            file=item, label=snakemake.config["label"]))
        for item in snakemake.input.true
    ]
    true = [item[~utils.na_mask(item)] for item in true]
    weight = np.concatenate(
        [np.repeat(1 / item.size, item.size) for item in true])
    weight /= weight.sum() / weight.size
    true = np.concatenate(true)
    tp = np.in1d(true, pos_types)  # true positives: type exists in reference
    tn = ~tp

    # Collect predictions per threshold across all prediction files.
    pred_dict = collections.defaultdict(list)
    for item in snakemake.input.pred:
        with h5py.File(item, "r") as f:
            g = f["prediction"]
            for threshold in g:
                pred_dict[float(threshold)].append(
                    cb.data.read_clean(g[threshold][...]))

    performance = []
    # Context manager closes the workbook even on error
    # (ExcelWriter.save() was removed in pandas 2.0).
    with pd.ExcelWriter(snakemake.output[1]) as cell_type_specific_excel:
        for threshold in sorted(pred_dict.keys()):  # keys are already floats
            pred = pred_dict[threshold] = np.concatenate(pred_dict[threshold])
            assert len(pred) == len(true)
            # Predicted negative: cells the model declined to assign.
            pn = np.isin(pred, ("unassigned", "ambiguous", "rejected"))
            pp = ~pn
            sensitivity = (weight * np.logical_and(tp, pp)).sum() \
                / (weight * tp).sum()
            specificity = (weight * np.logical_and(tn, pn)).sum() \
                / (weight * tn).sum()
            class_specific_accuracy = cb.metrics.class_specific_accuracy(
                true, pred, expect)
            class_specific_accuracy.insert(
                0, "positive",
                np.in1d(class_specific_accuracy.index, pos_types))
            # MBA = mean of the average accuracies over positive and
            # negative cell types.
            pos_mba = class_specific_accuracy.loc[
                class_specific_accuracy["positive"], "accuracy"].mean()
            neg_mba = class_specific_accuracy.loc[
                ~class_specific_accuracy["positive"], "accuracy"].mean()
            mba = (pos_mba + neg_mba) / 2
            performance.append(
                dict(ref_size=ref.size,
                     threshold=threshold,
                     sensitivity=sensitivity,
                     specificity=specificity,
                     pos_mba=pos_mba,
                     neg_mba=neg_mba,
                     mba=mba))
            class_specific_accuracy.to_excel(
                cell_type_specific_excel, str(threshold),
                index_label=snakemake.config["label"])
    with open(snakemake.output[0], "w") as f:
        json.dump(performance, f, indent=4)