Beispiel #1
0
def do_one(args):
    """Prepare cached inputs and result directories for the "fast" runs.

    The routine performs these steps in order:

    1. Exit with status 1 when no ``Rscript`` executable is on ``PATH``.
    2. Create ``args.output`` and attach a file handler (plus a console
       handler when ``args.verbose`` is truthy) to the ``prep_fast`` logger.
    3. When ``args.overwrite_input_bin`` is truthy, or any cached ``.npy``
       file is missing, read the inputs via ``read_matrix``, compute the
       pairwise Euclidean distance matrix of the centroids, and save one
       transformed matrix per ``rbp_p`` next to ``expr.npy``, ``Xcen.npy``
       and ``genes.npy``.
    4. For every ``(rbp_p, examine_top)`` pair, create the result directory
       and copy ``do_kmeans.R``, ``do_gpd.R`` and ``qval.R`` into it unless
       they are already there.

    Args:
        args: Namespace-like object providing ``matrix_type``, ``rbp_ps``,
            ``examine_tops``, ``output``, ``log_file``, ``verbose``,
            ``overwrite_input_bin``, ``expr`` and ``centroid``.

    Raises:
        SystemExit: with code 1 when ``Rscript`` cannot be located.
    """
    # Local import: only ``copyfile`` is known to be imported at file level.
    import shutil

    matrix_type = args.matrix_type

    # Look the executable up directly instead of spawning a shell and
    # comparing against exit status 127, which is a POSIX-shell convention.
    if shutil.which("Rscript") is None:
        sys.stderr.write("Rscript is not found\n")
        sys.stderr.flush()
        sys.exit(1)

    # exist_ok avoids a race with concurrent runs; makedirs also creates
    # missing parent directories.
    os.makedirs(args.output, exist_ok=True)

    log_file = "%s/%s" % (args.output, args.log_file)
    logger = logging.getLogger("prep_fast")
    logger.setLevel(logging.DEBUG)

    # Inspect this logger's own handlers. ``hasHandlers()`` also consults
    # ancestor loggers, so a handler on the root logger would wrongly
    # suppress creation of the log file.
    if not logger.handlers:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
        if args.verbose:
            logger.addHandler(logging.StreamHandler())

    # Cached binaries that must all exist for the rebuild to be skipped.
    required = ["expr.npy", "Xcen.npy", "genes.npy"]
    required.extend("t_matrix_%s_%.2f.npy" % (matrix_type, rbp_p)
                    for rbp_p in args.rbp_ps)
    rebuild = args.overwrite_input_bin or not all(
        os.path.isfile(os.path.join(args.output, name)) for name in required)

    if rebuild:
        expr, genes, Xcen = read_matrix(f_expr=args.expr,
                                        f_Xcen=args.centroid,
                                        logger=logger)
        logger.info(
            "Calculate all pairwise Euclidean distance between cells using their physical coordinates"
        )
        euc = squareform(pdist(Xcen, metric="euclidean"))
        for rbp_p in args.rbp_ps:
            logger.info("For rbp_p %.2f:", rbp_p)
            t_matrix = spatial_genes.rank_transform_matrix(
                euc,
                reverse=False,
                rbp_p=rbp_p,
                matrix_type=matrix_type,
                logger=logger)
            np.save(
                os.path.join(
                    args.output,
                    "t_matrix_%s_%.2f.npy" % (matrix_type, rbp_p)),
                t_matrix)
        np.save(os.path.join(args.output, "expr.npy"), expr)
        np.save(os.path.join(args.output, "Xcen.npy"), Xcen)
        np.save(os.path.join(args.output, "genes.npy"), genes)
    else:
        # Nothing below reads the cached arrays, so they are not loaded.
        logger.info("Using existing input binaries...")

    # "dissim" results drop the "_sim" infix from the directory name.
    if matrix_type == "dissim":
        dir_prefix = "result_fast_5000"
    else:
        dir_prefix = "result_fast_sim_5000"

    for rbp_p in args.rbp_ps:
        for examine_top in args.examine_tops:
            outdir = "%s/%s_%.2f_%.3f" % (args.output, dir_prefix, rbp_p,
                                          examine_top)
            os.makedirs(outdir, exist_ok=True)
            # The R helper scripts live next to the installed package.
            source_path = os.path.dirname(silhouetteRank.__file__)
            for script in ("do_kmeans.R", "do_gpd.R", "qval.R"):
                target = os.path.join(outdir, script)
                if not os.path.isfile(target):
                    copyfile(os.path.join(source_path, script), target)
Beispiel #2
0
def do_one(args):
    """Score every gene with the fast silhouette method and write the results.

    The routine performs these steps in order:

    1. Create ``<output>/logs.fast`` and attach a file handler (plus a
       console handler when ``args.verbose`` is truthy) to a logger named
       after ``rbp_p`` and ``examine_top``. The log file is truncated.
    2. When ``args.overwrite_input_bin`` is truthy, or any cached ``.npy``
       file is missing, read the inputs, build the transformed distance
       matrix and save all four arrays; otherwise load them from disk.
    3. Run ``spatial_genes.calc_silhouette_per_gene_approx`` once per trial
       in ``range(args.num_trials)`` and write each result to its own
       tab-separated text file (index, first field, second field to ten
       decimals).

    Args:
        args: Namespace-like object providing ``matrix_type``, ``rbp_p``,
            ``examine_top``, ``output``, ``verbose``,
            ``overwrite_input_bin``, ``expr``, ``centroid`` and
            ``num_trials``.
    """
    matrix_type = args.matrix_type
    rbp_p = args.rbp_p
    examine_top = args.examine_top

    # Creates args.output as well; exist_ok avoids a race between
    # concurrently started runs.
    logdir = "%s/logs.fast" % args.output
    os.makedirs(logdir, exist_ok=True)

    log_file = "%s/real_%.2f_%.3f.out" % (logdir, rbp_p, examine_top)
    logger = logging.getLogger("real_fast_%.2f_%.3f" % (rbp_p, examine_top))
    logger.setLevel(logging.DEBUG)

    # Inspect this logger's own handlers. ``hasHandlers()`` also consults
    # ancestor loggers, so a handler on the root logger would wrongly
    # suppress creation of the log file.
    if not logger.handlers:
        handler = logging.FileHandler(log_file, "w")
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
        if args.verbose:
            logger.addHandler(logging.StreamHandler())

    expr_path = os.path.join(args.output, "expr.npy")
    xcen_path = os.path.join(args.output, "Xcen.npy")
    genes_path = os.path.join(args.output, "genes.npy")
    t_matrix_path = os.path.join(
        args.output, "t_matrix_%s_%.2f.npy" % (matrix_type, rbp_p))

    # Rebuild when asked to, or when any cached binary is missing.
    rebuild = args.overwrite_input_bin or not all(
        os.path.isfile(path)
        for path in (expr_path, xcen_path, genes_path, t_matrix_path))

    if rebuild:
        # NOTE(review): sibling routines unpack ``read_matrix`` as
        # (expr, genes, Xcen); confirm that ``read`` really returns
        # (expr, Xcen, genes) in this order.
        expr, Xcen, genes = read(f_expr=args.expr,
                                 f_Xcen=args.centroid,
                                 logger=logger)
        logger.info(
            "Calculate all pairwise Euclidean distance between cells using their physical coordinates"
        )
        euc = squareform(pdist(Xcen, metric="euclidean"))
        logger.info(
            "Rank transform euclidean distance, and then apply exponential transform"
        )
        t_matrix = spatial_genes.rank_transform_matrix(euc,
                                                       reverse=False,
                                                       rbp_p=rbp_p,
                                                       matrix_type=matrix_type,
                                                       logger=logger)
        np.save(t_matrix_path, t_matrix)
        np.save(expr_path, expr)
        np.save(xcen_path, Xcen)
        np.save(genes_path, genes)
    else:
        logger.info("Using existing input binaries...")
        expr = np.load(expr_path)
        genes = np.load(genes_path)
        t_matrix = np.load(t_matrix_path)

    logger.info("Compute silhouette metric per gene using fast method")

    # Similarity-based results carry an extra ".sim" tag in the file name.
    if matrix_type == "sim":
        name_template = "silhouette.sim.fast.rbp.%.2f.top.%.3f.%d.txt"
    else:
        name_template = "silhouette.fast.rbp.%.2f.top.%.3f.%d.txt"

    for t_trial in range(args.num_trials):
        res = spatial_genes.calc_silhouette_per_gene_approx(
            genes=genes,
            expr=expr,
            matrix=t_matrix,
            matrix_type=matrix_type,
            examine_top=examine_top,
            logger=logger)
        f_name = os.path.join(args.output,
                              name_template % (rbp_p, examine_top, t_trial))
        # Context manager closes the file even if a write raises.
        with open(f_name, "w") as fw:
            for ind, v in enumerate(res):
                fw.write("%d\t%s\t%.10f\n" % (ind, v[0], v[1]))
Beispiel #3
0
			continue
		
		scores = []
		f = open("%s/%d" % (outdir, target))
		for l in f:
			l = l.rstrip("\n")
			scores.append(float(l))
		f.close()
		if len(scores)!=5000:
			size_to_do.append(target)

	if size_to_do==[]:
		sys.exit(0)

	expr, genes, Xcen = read_matrix()
	ncell = Xcen.shape[0] 

	sys.stdout.write("Calculate all pairwise Euclidean distance between cells using their physical coordinates\n")
	euc = squareform(pdist(Xcen, metric="euclidean"))
	sys.stdout.write("Rank transform euclidean distance, and then apply exponential transform\n")
	t_matrix = spatial_genes.rank_transform_matrix(euc, reverse=False, rbp_p=rbp_p, matrix_type=matrix_type)
	sys.stdout.write("Compute silhouette metric per gene\n")
	source_path = os.path.dirname(silhouetteRank.__file__)
	if not os.path.isfile("%s/do_gpd.R" % outdir):
		copyfile("%s/do_gpd.R" % source_path, "%s/do_gpd.R" % outdir)
	if not os.path.isfile("%s/do_kmeans.R" % outdir):
		copyfile("%s/do_kmeans.R" % source_path, "%s/do_kmeans.R" % outdir)


	res = spatial_genes.random_pattern(matrix=t_matrix, matrix_type=matrix_type, num_cell = ncell, sizes=size_to_do, trials_per_gene=5000, run_gpd=True, outdir=outdir)
def do_one(args):
    """Generate random-pattern results for one (rbp_p, examine_top) setting.

    The routine performs these steps in order:

    1. Exit with status 1 when no ``Rscript`` executable is on ``PATH``.
    2. Create ``<output>/logs`` and attach a file handler to a logger named
       after ``rbp_p`` and ``examine_top``, so the progress messages below
       are actually recorded. The log file is truncated.
    3. When ``args.overwrite_input_bin`` is truthy, or any cached ``.npy``
       file is missing, read the inputs via ``read_matrix``, build the
       transformed distance matrix and save all four arrays; otherwise load
       them from disk.
    4. Create the result directory, copy ``do_gpd.R`` and ``do_kmeans.R``
       into it when absent, obtain the query sizes from ``read_frequency``
       and run ``spatial_genes.random_pattern`` with 5000 trials per gene.

    Args:
        args: Namespace-like object providing ``matrix_type``, ``rbp_p``,
            ``examine_top``, ``output``, ``overwrite_input_bin``, ``expr``,
            ``centroid`` and ``query_sizes``. An optional truthy ``verbose``
            attribute additionally echoes log records to the console.

    Raises:
        SystemExit: with code 1 when ``Rscript`` cannot be located.
    """
    # Local import: only ``copyfile`` is known to be imported at file level.
    import shutil

    matrix_type = args.matrix_type
    rbp_p = args.rbp_p
    examine_top = args.examine_top

    # Look the executable up directly instead of spawning a shell and
    # comparing against exit status 127, which is a POSIX-shell convention.
    if shutil.which("Rscript") is None:
        sys.stderr.write("Rscript is not found\n")
        sys.stderr.flush()
        sys.exit(1)

    # Creates args.output as well; exist_ok avoids a race between
    # concurrently started runs.
    logdir = "%s/logs" % args.output
    os.makedirs(logdir, exist_ok=True)

    log_file = "%s/%.2f_%.3f.out" % (logdir, rbp_p, examine_top)
    logger = logging.getLogger("random_%.2f_%.3f" % (rbp_p, examine_top))
    logger.setLevel(logging.DEBUG)

    # Without a handler the INFO records emitted below are silently dropped
    # and ``log_file`` is never written. Attach the handler only once so
    # repeated calls do not duplicate output.
    if not logger.handlers:
        handler = logging.FileHandler(log_file, "w")
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
        # ``verbose`` was never read by this routine before, so tolerate
        # argument namespaces that do not define it.
        if getattr(args, "verbose", False):
            logger.addHandler(logging.StreamHandler())

    expr_path = os.path.join(args.output, "expr.npy")
    xcen_path = os.path.join(args.output, "Xcen.npy")
    genes_path = os.path.join(args.output, "genes.npy")
    t_matrix_path = os.path.join(
        args.output, "t_matrix_%s_%.2f.npy" % (matrix_type, rbp_p))

    # Rebuild when asked to, or when any cached binary is missing.
    rebuild = args.overwrite_input_bin or not all(
        os.path.isfile(path)
        for path in (expr_path, xcen_path, genes_path, t_matrix_path))

    if rebuild:
        expr, genes, Xcen = read_matrix(f_expr=args.expr,
                                        f_Xcen=args.centroid,
                                        logger=logger)
        logger.info(
            "Calculate all pairwise Euclidean distance between cells using their physical coordinates"
        )
        euc = squareform(pdist(Xcen, metric="euclidean"))
        logger.info(
            "Rank transform euclidean distance, and then apply exponential transform"
        )
        t_matrix = spatial_genes.rank_transform_matrix(euc,
                                                       reverse=False,
                                                       rbp_p=rbp_p,
                                                       matrix_type=matrix_type,
                                                       logger=logger)
        np.save(t_matrix_path, t_matrix)
        np.save(expr_path, expr)
        np.save(xcen_path, Xcen)
        np.save(genes_path, genes)
    else:
        logger.info("Using existing input binaries...")
        expr = np.load(expr_path)
        Xcen = np.load(xcen_path)
        genes = np.load(genes_path)
        t_matrix = np.load(t_matrix_path)

    ncell = Xcen.shape[0]

    logger.info("Compute silhouette metric per gene")

    # "dissim" results drop the "_sim" infix from the directory name.
    if matrix_type == "dissim":
        dir_prefix = "result_5000"
    else:
        dir_prefix = "result_sim_5000"
    outdir = "%s/%s_%.2f_%.3f" % (args.output, dir_prefix, rbp_p, examine_top)
    os.makedirs(outdir, exist_ok=True)

    # The R helper scripts live next to the installed package.
    source_path = os.path.dirname(silhouetteRank.__file__)
    for script in ("do_gpd.R", "do_kmeans.R"):
        target = os.path.join(outdir, script)
        if not os.path.isfile(target):
            copyfile(os.path.join(source_path, script), target)

    sizes = read_frequency(expr=expr,
                           genes=genes,
                           Xcen=Xcen,
                           frequency_file=None,
                           read_from_file=False,
                           outdir=outdir,
                           examine_top=examine_top,
                           num_query_sizes=args.query_sizes)
    # The return value is not used here; results are expected under outdir.
    spatial_genes.random_pattern(matrix=t_matrix,
                                 matrix_type=matrix_type,
                                 num_cell=ncell,
                                 sizes=sizes,
                                 trials_per_gene=5000,
                                 run_gpd=True,
                                 outdir=outdir,
                                 logger=logger)