def main():
    """Build and run a job graph of PRMF runs on randomized pathways.

    For each of ``--n-runs`` runs two jobs are scheduled:

    1. ``randomize_network.R`` reads the input manifolds and writes into
       ``<outdir>/run<i>/random_pathways``.
    2. ``nmf_pathway.py`` runs afterwards with ``--manifolds-file`` pointing
       at ``pathways_file.txt`` in that directory (presumably written by the
       randomization job -- confirm against randomize_network.R).

    All remaining command-line arguments are forwarded to ``nmf_pathway.py``.
    """
    parser = argparse.ArgumentParser(description="""
Run nmf_pathway.py with different _r_andom _r_estarts _a_nd _e_dges.
Pass through arguments to nmf_pathway.py except for --condor, --manifolds-init, --n-runs, and --outdir

TODO
----
Capture --manifolds and --manifolds-file and randomize the edges before passing to nmf_pathway.py
""")
    parser.add_argument("--n-runs", type=int, default=2,
                        help="Number of random restarts")
    parser.add_argument("--condor", action='store_true',
                        help='If flag is provided, submit jobs to Condor rather than running them on this machine')
    prmf_args.add_prmf_arguments(parser)
    args = parser.parse_args()

    job_graph = nx.DiGraph()
    job_id = 0

    # Everything left in args_dict after the pops below is forwarded verbatim.
    args_dict = vars(args)
    outdir = args_dict.pop('outdir')
    condor = args_dict.pop('condor')
    manifolds = args_dict.pop('manifolds')
    manifolds_file = args_dict.pop('manifolds_file')
    args_dict.pop('manifolds_init')
    n_runs = args_dict.pop('n_runs')

    if manifolds is None:
        # Fall back to --manifolds-file instead of crashing on
        # ``['--infiles'] + None``.
        # NOTE(review): assumes the file lists one pathway filepath per line.
        if manifolds_file is None:
            parser.error("one of --manifolds or --manifolds-file is required")
        with open(manifolds_file) as fh:
            manifolds = [line.strip() for line in fh if line.strip()]

    # The forwarded arguments do not change between runs.
    passthrough_args = script_utils.args_to_list(args_dict)

    for i in range(n_runs):
        run_outdir = os.path.join(outdir, "run{}".format(i))
        random_pathway_dir = os.path.join(run_outdir, 'random_pathways')
        # mkdir_p tolerates existing directories so the pipeline can be re-run.
        script_utils.mkdir_p(run_outdir)
        script_utils.mkdir_p(random_pathway_dir)

        # 1) randomize the pathway edges for this run
        attrs = {
            'exe': 'randomize_network.R',
            'args': ['--infiles'] + list(manifolds) + ['--outdir', random_pathway_dir],
            'out': os.path.join(random_pathway_dir, 'randomize_network.out'),
            'err': os.path.join(random_pathway_dir, 'randomize_network.err')
        }
        randomize_network_job_id = job_id
        # Keyword expansion works on NetworkX 1.x and 2.x; a positional
        # attribute dict is rejected by 2.x.
        job_graph.add_node(randomize_network_job_id, **attrs)
        job_id += 1

        # 2) factorize using the randomized pathways
        pathways_file = os.path.join(random_pathway_dir, 'pathways_file.txt')
        args_list = passthrough_args + [
            '--outdir', run_outdir,
            '--manifolds-init',
            '--manifolds-file', pathways_file
        ]
        attrs = {
            'exe': 'nmf_pathway.py',
            'args': args_list,
            'out': os.path.join(run_outdir, 'nmf_pathway.out'),
            'err': os.path.join(run_outdir, 'nmf_pathway.err')
        }
        nmf_pathway_job_id = job_id
        job_graph.add_node(nmf_pathway_job_id, **attrs)
        job_graph.add_edge(randomize_network_job_id, nmf_pathway_job_id)
        job_id += 1

    script_utils.run_digraph(outdir, job_graph, condor=condor)
def main():
    """Build and run a job graph of independent nmf_pathway.py random restarts.

    One ``nmf_pathway.py`` job is scheduled per run, each writing to
    ``<outdir>/run<i>``.  The jobs have no dependencies on each other.  All
    command-line arguments other than ``--condor``, ``--manifolds-init``,
    ``--n-runs`` and ``--outdir`` are forwarded to ``nmf_pathway.py``.
    """
    parser = argparse.ArgumentParser(description="""
Run nmf_pathway.py with different _r_andom _r_estarts.
Pass through arguments to nmf_pathway.py except for --condor, --manifolds-init, --n-runs, and --outdir
""")
    parser.add_argument("--n-runs", type=int, default=2,
                        help="Number of random restarts")
    parser.add_argument(
        "--condor", action='store_true',
        help='If flag is provided, submit jobs to Condor rather than running them on this machine')
    prmf_args.add_prmf_arguments(parser)
    args = parser.parse_args()

    job_graph = nx.DiGraph()

    # Everything left in args_dict after the pops below is forwarded verbatim.
    args_dict = vars(args)
    outdir = args_dict.pop('outdir')
    condor = args_dict.pop('condor')
    args_dict.pop('manifolds_init')
    n_runs = args_dict.pop('n_runs')

    # The forwarded arguments do not change between runs.
    passthrough_args = script_utils.args_to_list(args_dict)

    for job_id in range(n_runs):
        run_outdir = os.path.join(outdir, "run{}".format(job_id))
        # mkdir_p tolerates existing directories so the pipeline can be re-run.
        script_utils.mkdir_p(run_outdir)
        attrs = {
            'exe': 'nmf_pathway.py',
            'args': passthrough_args + ['--outdir', run_outdir, '--manifolds-init'],
            'out': os.path.join(run_outdir, 'nmf_pathway.out'),
            'err': os.path.join(run_outdir, 'nmf_pathway.err')
        }
        # Keyword expansion works on NetworkX 1.x and 2.x; a positional
        # attribute dict is rejected by 2.x.
        job_graph.add_node(job_id, **attrs)

    script_utils.run_digraph(outdir, job_graph, condor=condor)
def main():
    """Compare PRMF on true versus randomized pathways for a real dataset.

    Two ``nmf_pathway.py`` jobs are scheduled on ``--data``: one using
    ``--manifolds-file-true`` and one using ``--manifolds-file-random``.
    When ``--true-seeds`` is supplied, each factorization is followed by an
    ``evaluate_screen_sim.py`` job and a final ``plot_pr_curve.py`` job that
    runs after both factorizations.

    The previous version of this function referenced names that only exist
    in the simulated-data pipeline (``diffusion_job_id``, ``diffused_fp``,
    ``nodelist_fp``, ``chosen_seeds_fp``) and read ``args.outdir`` without
    declaring ``--outdir``, so it could not run.
    """
    parser = argparse.ArgumentParser(description="""
Evalute nmf_pathway.py using true pathways against randomized pathways for a "real" dataset instead of a simulated one
""")
    parser.add_argument("--rng-seed", help="Seed for random number generators", default=None)
    parser.add_argument(
        "--condor", action='store_true',
        help="Flag which indicates we should submit jobs to Condor")
    parser.add_argument("--outdir", help="Directory to write results to", required=True)

    # prepare data
    parser.add_argument("--data", help="CSV", required=True)
    parser.add_argument(
        "--nodelist",
        help="Contains node identifiers for data and manifolds",
        required=True)

    # randomization
    # (run kegg.R then run randomize_network.R then prepare two separate files manifolds_file_true
    # and manifolds_file_random which contain filepaths to some of the kegg and some of randomized kegg, respectively)
    # TODO trouble is the selection of a subset of networks into a file and that randomize_network.R runs on directories
    # could be solved by specifying selection with KEGG pathway identifiers and a directory rather than by a filepath

    # factorization
    parser.add_argument("--manifolds-file-true", required=True)
    parser.add_argument("--manifolds-file-random", required=True)
    parser.add_argument("--gamma", default="1.0")
    parser.add_argument("--k-latent", default="6")
    # other arguments used in factorization: outdir, nodelist, data

    # evaluation
    # A real dataset has no simulated seeds, so the ground truth must be
    # supplied explicitly; without it evaluation and plotting are skipped.
    parser.add_argument(
        "--true-seeds", default=None,
        help="File of true seeds passed to evaluate_screen_sim.py and plot_pr_curve.py; "
             "if omitted, only the factorizations are run")

    args = parser.parse_args()

    job_graph = nx.DiGraph()

    def add_job(attrs, parents=()):
        """Add a job node that runs after *parents* and return its id."""
        # Ids are handed out sequentially from 0, so the next free id is
        # always the current node count.
        new_id = job_graph.number_of_nodes()
        # Keyword expansion works on NetworkX 1.x and 2.x.
        job_graph.add_node(new_id, **attrs)
        for parent in parents:
            job_graph.add_edge(parent, new_id)
        return new_id

    def add_factorization(manifolds_file, branch_outdir):
        """Schedule nmf_pathway.py (plus optional evaluation) for one branch."""
        script_utils.mkdir_p(branch_outdir)
        nmf_args = [
            "--data", args.data,
            '--k-latent', args.k_latent,
            "--manifolds-file", manifolds_file,
            "--nodelist", args.nodelist,
            "--node-attribute", "name",
            "--gamma", args.gamma,
            "--outdir", branch_outdir
        ]
        if args.rng_seed is not None:
            nmf_args += ['--seed', args.rng_seed]
        gene_by_latent_fp = os.path.join(branch_outdir, "V.csv")
        # There is no upstream job here: the data is provided by the user.
        factorization_id = add_job({
            'exe': "nmf_pathway.py",
            'args': nmf_args,
            'out': os.path.join(branch_outdir, "nmf_pathway.out"),
            'err': os.path.join(branch_outdir, "nmf_pathway.err")
        })
        if args.true_seeds is not None:
            add_job({
                'exe': "evaluate_screen_sim.py",
                'args': [
                    "--gene-by-latent", gene_by_latent_fp,
                    "--nodelist", args.nodelist,
                    "--true-seeds", args.true_seeds
                ],
                'out': os.path.join(branch_outdir, "evaluate.out"),
                'err': os.path.join(branch_outdir, "evaluate.err")
            }, parents=[factorization_id])
        return factorization_id, gene_by_latent_fp

    # factorization
    # two branches: nmf_pathway on true pathways and nmf_pathway on randomized pathways
    prmf_true_job_id, prmf_true_gene_by_latent_fp = add_factorization(
        args.manifolds_file_true,
        os.path.join(args.outdir, "nmf_pathway_true"))
    prmf_random_job_id, prmf_random_gene_by_latent_fp = add_factorization(
        args.manifolds_file_random,
        os.path.join(args.outdir, "nmf_pathway_random"))

    # plot
    # TODO rework name of arguments
    if args.true_seeds is not None:
        plot_outdir = os.path.join(args.outdir, 'pr_curves')
        script_utils.mkdir_p(plot_outdir)
        # run after both factorizations
        add_job({
            'exe': 'plot_pr_curve.py',
            'args': [
                '--gene-by-latent-csvs', prmf_random_gene_by_latent_fp, prmf_true_gene_by_latent_fp,
                '--labels', 'PRMF_random', 'PRMF_real',
                '--nodelist', args.nodelist,
                '--true-seeds', args.true_seeds,
                '--outdir', plot_outdir
            ],
            'out': os.path.join(plot_outdir, 'plot.out'),
            'err': os.path.join(plot_outdir, 'plot.err')
        }, parents=[prmf_true_job_id, prmf_random_job_id])

    script_utils.run_digraph(args.outdir, job_graph, condor=args.condor)
def main():
    """Benchmark PRMF against NMF, PLIER, NBS and CoGAPS on simulated gene lists.

    The job graph is:

    * ``simulate_screens.py`` -> ``diffusion.py``
    * after diffusion: NMF (optional), PRMF (always), PLIER (optional) and
      CoGAPS (optional); NBS (optional) runs directly after the simulation
      because it is given the simulated hits rather than the diffused data
    * each factorization is followed by ``evaluate_screen_sim.py``
    * ``plot_pr_curve.py`` runs after every enabled factorization

    Fixes relative to the previous version: the CoGAPS evaluation now depends
    on the CoGAPS job (it depended on the NBS job, which also raised a
    NameError with ``--do-nbs false``); the CoGAPS wrapper now writes into its
    own directory, where ``P.csv`` is later read from; ``--simulator`` is only
    forwarded when given instead of passing ``None``.
    """
    parser = argparse.ArgumentParser(description="""
Evalute nmf_pathway.py by simulating gene lists and compare against nmf_init.py
""")
    # run environment
    parser.add_argument("--rng-seed", help="Seed for random number generators", default=None)
    parser.add_argument("--condor", action='store_true')
    parser.add_argument("--dry-run", action='store_true')
    parser.add_argument("--do-nmf", default=True, type=script_utils.str2bool)
    parser.add_argument("--do-plier", default=True, type=script_utils.str2bool)
    parser.add_argument("--do-nbs", default=True, type=script_utils.str2bool)
    parser.add_argument("--do-cogaps", default=True, type=script_utils.str2bool)

    # simulation
    parser.add_argument("--n-gene-lists", help="Number of gene lists to simulate", type=int, default=6)
    parser.add_argument("--nodelist", help="Universe of node identifiers and an ordering on those identifiers", required=True)
    # TODO future versions manifolds should be distinct from node lists derived from pathways
    parser.add_argument("--seed-lists", required=True, nargs='+')
    parser.add_argument("--outdir", required=True)
    parser.add_argument("--simulator")
    parser.add_argument("--noise-pr", default="0.05")

    # diffusion
    parser.add_argument("--network", required=True)
    # other arguments used in diffusion: nodelist

    # factorization
    parser.add_argument("--cross-validation", help="Fraction of data in [0,1] to hold out and test reconstruction performance on")
    parser.add_argument("--manifolds-file", required=True)
    parser.add_argument("--gamma", default="1.0")
    parser.add_argument("--k-latent", default="6")
    # other arguments used in factorization: outdir, nodelist, data

    # evaluation
    # (no additional arguments)
    args = parser.parse_args()

    job_graph = nx.DiGraph()

    def add_job(attrs, parents=()):
        """Add a job node that runs after *parents* and return its id."""
        # Ids are handed out sequentially from 0, so the next free id is
        # always the current node count.
        new_id = job_graph.number_of_nodes()
        # Keyword expansion works on NetworkX 1.x and 2.x; a positional
        # attribute dict is rejected by 2.x.
        job_graph.add_node(new_id, **attrs)
        for parent in parents:
            job_graph.add_edge(parent, new_id)
        return new_id

    # Optional arguments shared by several factorization jobs.
    seed_args = ['--seed', args.rng_seed] if args.rng_seed is not None else []
    cv_args = []
    if args.cross_validation is not None:
        cv_args = ['--cross-validation', args.cross_validation]

    # simulation
    simulate_args = ["--seed-lists"] + args.seed_lists + [
        "--n-gene-lists", str(args.n_gene_lists),
        "--nodelist", args.nodelist,
        "--outdir", args.outdir
    ]
    if args.simulator is not None:
        # Leave the choice to simulate_screens.py when no simulator is named.
        simulate_args += ['--simulator', args.simulator]
    simulate_args += ['--noise-pr', args.noise_pr]
    if args.rng_seed is not None:
        simulate_args += ['--rng-seed', args.rng_seed]
    simulation_job_id = add_job({
        'exe': "simulate_screens.py",
        'args': simulate_args,
        'out': os.path.join(args.outdir, "simulate_screens.out"),
        'err': os.path.join(args.outdir, "simulate_screens.err"),
        'env': 'prmf'
    })

    # Files the later jobs expect the simulation to have produced.
    sim_list_fps = [
        os.path.join(args.outdir, "sim_list_{}.txt".format(i + 1))
        for i in range(args.n_gene_lists)
    ]
    chosen_seeds_fp = os.path.join(args.outdir, "chosen_seeds.txt")

    def add_evaluation(factorization_id, gene_by_latent_fp, method_outdir):
        """Schedule evaluate_screen_sim.py after one factorization job."""
        return add_job({
            'exe': "evaluate_screen_sim.py",
            'args': ["--gene-by-latent", gene_by_latent_fp,
                     "--nodelist", args.nodelist,
                     "--true-seeds", chosen_seeds_fp],
            'out': os.path.join(method_outdir, "evaluate.out"),
            'err': os.path.join(method_outdir, "evaluate.err"),
            'env': 'prmf'
        }, parents=[factorization_id])

    # diffusion
    diffused_fp = os.path.join(args.outdir, "diffused.csv")
    diffusion_job_id = add_job({
        'exe': "diffusion.py",
        'args': ["--network", args.network, "--nodelist", args.nodelist, "--gene-lists"]
                + sim_list_fps + ["--diffused", diffused_fp],
        'out': os.path.join(args.outdir, "diffusion.out"),
        'err': os.path.join(args.outdir, "diffusion.err"),
        'env': 'prmf'
    }, parents=[simulation_job_id])

    # (label, gene-by-latent CSV, factorization job id) for every enabled
    # method, in the order they are passed to the plot.
    methods = []

    # factorization
    # 1) nmf - {{
    if args.do_nmf:
        nmf_outdir = os.path.join(args.outdir, "nmf")
        script_utils.mkdir_p(nmf_outdir)
        nmf_gene_by_latent_fp = os.path.join(nmf_outdir, "V.csv")
        nmf_job_id = add_job({
            'exe': "nmf.py",
            'args': ['--data', diffused_fp, '--k-latent', args.k_latent, '--outdir', nmf_outdir]
                    + seed_args + cv_args,
            'out': os.path.join(nmf_outdir, 'nmf.out'),
            'err': os.path.join(nmf_outdir, 'nmf.err'),
            'env': 'prmf'
        }, parents=[diffusion_job_id])
        add_evaluation(nmf_job_id, nmf_gene_by_latent_fp, nmf_outdir)
        methods.append(('NMF', nmf_gene_by_latent_fp, nmf_job_id))
    # }} - nmf

    # 2) nmf_pathway - {{
    nmf_pathway_outdir = os.path.join(args.outdir, "nmf_pathway")
    script_utils.mkdir_p(nmf_pathway_outdir)
    prmf_gene_by_latent_fp = os.path.join(nmf_pathway_outdir, "V.csv")
    prmf_job_id = add_job({
        'exe': "nmf_pathway.py",
        'args': ["--data", diffused_fp, '--k-latent', args.k_latent,
                 "--manifolds-file", args.manifolds_file,
                 "--node-attribute", "name",
                 "--nodelist", args.nodelist,
                 "--gamma", args.gamma,
                 "--outdir", nmf_pathway_outdir] + seed_args + cv_args,
        'out': os.path.join(nmf_pathway_outdir, "nmf_pathway.out"),
        'err': os.path.join(nmf_pathway_outdir, "nmf_pathway.err"),
        'env': 'prmf'
    }, parents=[diffusion_job_id])
    add_evaluation(prmf_job_id, prmf_gene_by_latent_fp, nmf_pathway_outdir)
    methods.append(('PRMF', prmf_gene_by_latent_fp, prmf_job_id))
    # }} - nmf_pathway

    # 3) PLIER - {{
    if args.do_plier:
        PLIER_outdir = os.path.join(args.outdir, "PLIER")
        script_utils.mkdir_p(PLIER_outdir)
        PLIER_gene_by_latent_fp = os.path.join(PLIER_outdir, "Z.csv")
        PLIER_job_id = add_job({
            'exe': 'PLIER_wrapper.R',
            'args': ['--data', diffused_fp, '--nodelist', args.nodelist,
                     '--k-latent', args.k_latent,
                     '--pathways-file', args.manifolds_file,
                     '--node-attribute', 'name',
                     '--outdir', PLIER_outdir] + seed_args,
            'out': os.path.join(PLIER_outdir, "PLIER_wrapper.out"),
            'err': os.path.join(PLIER_outdir, "PLIER_wrapper.err"),
            'env': 'PLIER'
        }, parents=[diffusion_job_id])
        add_evaluation(PLIER_job_id, PLIER_gene_by_latent_fp, PLIER_outdir)
        methods.append(("PLIER", PLIER_gene_by_latent_fp, PLIER_job_id))
    # }} - PLIER

    # 4) NBS - {{
    # NOTE NBS does its own diffusion based on binary somatic mutation profiles so we pass the simulated hits rather than our diffused data
    if args.do_nbs:
        NBS_outdir = os.path.join(args.outdir, 'NBS')
        script_utils.mkdir_p(NBS_outdir)
        NBS_gene_by_latent_fp = os.path.join(NBS_outdir, "W.csv")
        NBS_job_id = add_job({
            'exe': 'pyNBS_wrapper.py',
            'args': ['--nodelist', args.nodelist, '--gene-lists'] + sim_list_fps
                    + ['--network', args.network, '--k-latent', args.k_latent, '--outdir', NBS_outdir],
            'out': os.path.join(NBS_outdir, 'pyNBS_wrapper.out'),
            'err': os.path.join(NBS_outdir, 'pyNBS_wrapper.err'),
            'env': 'pyNBS'
        }, parents=[simulation_job_id])
        add_evaluation(NBS_job_id, NBS_gene_by_latent_fp, NBS_outdir)
        methods.append(("NBS", NBS_gene_by_latent_fp, NBS_job_id))
    # }} - NBS

    # 5) CoGAPS - {{
    if args.do_cogaps:
        CoGAPS_outdir = os.path.join(args.outdir, 'CoGAPS')
        script_utils.mkdir_p(CoGAPS_outdir)
        CoGAPS_gene_by_latent_fp = os.path.join(CoGAPS_outdir, "P.csv")
        CoGAPS_job_id = add_job({
            'exe': 'CoGAPS_wrapper.R',
            # Write into the CoGAPS directory: P.csv is read from there below.
            'args': ['--data', diffused_fp, '--k-latent', args.k_latent, '--outdir', CoGAPS_outdir],
            'out': os.path.join(CoGAPS_outdir, 'CoGAPS_wrapper.out'),
            'err': os.path.join(CoGAPS_outdir, 'CoGAPS_wrapper.err'),
            'env': 'CoGAPS'
        }, parents=[diffusion_job_id])
        # The evaluation must wait for CoGAPS itself, not for NBS.
        add_evaluation(CoGAPS_job_id, CoGAPS_gene_by_latent_fp, CoGAPS_outdir)
        methods.append(("CoGAPS", CoGAPS_gene_by_latent_fp, CoGAPS_job_id))
    # }} - CoGAPS

    # plot
    plot_outdir = os.path.join(args.outdir, 'pr_curves')
    script_utils.mkdir_p(plot_outdir)
    labels = [label for label, _, _ in methods]
    gene_by_latent_csvs = [csv_fp for _, csv_fp, _ in methods]
    # run after all methods
    add_job({
        'exe': 'plot_pr_curve.py',
        'args': ['--gene-by-latent-csvs'] + gene_by_latent_csvs
                + ['--labels'] + labels
                + ['--nodelist', args.nodelist, '--true-seeds', chosen_seeds_fp, '--outdir', plot_outdir],
        'out': os.path.join(plot_outdir, 'plot.out'),
        'err': os.path.join(plot_outdir, 'plot.err'),
        'env': 'prmf'
    }, parents=[method_job_id for _, _, method_job_id in methods])

    script_utils.run_digraph(args.outdir, job_graph, condor=args.condor, dry_run=args.dry_run)
def main():
    """Compare PRMF on true versus randomized pathways using simulated gene lists.

    The job graph is a chain ``prepare_nodelist.py`` -> ``simulate_screens.py``
    -> ``diffusion.py``, followed by two ``nmf_pathway.py`` branches (true and
    randomized manifolds).  Each branch is followed by
    ``evaluate_screen_sim.py``, and ``plot_pr_curve.py`` runs after both
    factorizations.
    """
    parser = argparse.ArgumentParser(description="""
Evalute nmf_pathway.py against true pathways and randomized pathways
""")
    parser.add_argument("--rng-seed", help="Seed for random number generators", default=None)
    parser.add_argument(
        "--condor", action='store_true',
        help="Flag which indicates we should submit jobs to Condor")

    # prepare networks
    parser.add_argument(
        '--stringdb',
        help="STRINGdb database file e.g. 9606.protein.links.detailed.v10.txt",
        required=True)

    # simulation
    parser.add_argument("--n-gene-lists", help="Number of gene lists to simulate", type=int, default=6)
    # TODO future versions manifolds should be distinct from node lists derived from pathways
    parser.add_argument("--seed-lists", required=True, nargs='+')
    parser.add_argument("--outdir", required=True)
    parser.add_argument("--simulator", default='mixture')  # TODO check option value
    parser.add_argument("--noise-pr", default="0.05")

    # randomization
    # (run kegg.R then run randomize_network.R then prepare two separate files manifolds_file_true
    # and manifolds_file_random which contain filepaths to some of the kegg and some of randomized kegg, respectively)
    # TODO trouble is the selection of a subset of networks into a file and that kegg.R runs on directories
    # could be solved by specifying selection with KEGG pathway identifiers and a directory rather than by a filepath

    # diffusion
    # (no additional arguments)
    # other arguments used in diffusion: nodelist

    # factorization
    parser.add_argument("--manifolds-file-true", required=True)
    parser.add_argument("--manifolds-file-random", required=True)
    parser.add_argument("--gamma", default="1.0")
    parser.add_argument("--k-latent", default="6")
    # other arguments used in factorization: outdir, nodelist, data

    # evaluation
    # (no additional arguments)
    args = parser.parse_args()

    job_graph = nx.DiGraph()

    def add_job(attrs, parents=()):
        """Add a job node that runs after *parents* and return its id."""
        # Ids are handed out sequentially from 0, so the next free id is
        # always the current node count.
        new_id = job_graph.number_of_nodes()
        # Keyword expansion works on NetworkX 1.x and 2.x; a positional
        # attribute dict is rejected by 2.x.
        job_graph.add_node(new_id, **attrs)
        for parent in parents:
            job_graph.add_edge(parent, new_id)
        return new_id

    seed_args = ['--seed', args.rng_seed] if args.rng_seed is not None else []

    # prepare networks
    nodelist_fp = os.path.join(args.outdir, 'nodelist.txt')
    string_kegg_union_fp = os.path.join(args.outdir, 'string_kegg_union.graphml')
    with open(args.manifolds_file_true) as fh:
        # One graphml filepath per line; blank lines (e.g. a trailing newline
        # at the end of the file) must not become empty path arguments.
        manifold_fps = [line.rstrip() for line in fh if line.strip()]
    prepare_job_id = add_job({
        'exe': 'prepare_nodelist.py',
        'args': ['--stringdb', args.stringdb, '--graphmls'] + manifold_fps + [
            '--out-nodelist', nodelist_fp,
            '--out-graph', string_kegg_union_fp,
            '--node-attribute', 'name'
        ],
        'out': os.path.join(args.outdir, 'prepare_nodelist.out'),
        'err': os.path.join(args.outdir, 'prepare_nodelist.err')
    })

    # simulation
    simulate_args = ["--seed-lists"] + args.seed_lists + [
        "--n-gene-lists", str(args.n_gene_lists),
        "--nodelist", nodelist_fp,
        "--outdir", args.outdir,
        '--simulator', args.simulator,
        '--noise-pr', args.noise_pr
    ]
    if args.rng_seed is not None:
        simulate_args += ['--rng-seed', args.rng_seed]
    simulation_job_id = add_job({
        'exe': "simulate_screens.py",
        'args': simulate_args,
        'out': os.path.join(args.outdir, "simulate_screens.out"),
        'err': os.path.join(args.outdir, "simulate_screens.err")
    }, parents=[prepare_job_id])

    # Files the later jobs expect the simulation to have produced.
    sim_list_fps = [
        os.path.join(args.outdir, "sim_list_{}.txt".format(i + 1))
        for i in range(args.n_gene_lists)
    ]
    chosen_seeds_fp = os.path.join(args.outdir, "chosen_seeds.txt")

    # diffusion
    diffused_fp = os.path.join(args.outdir, "diffused.csv")
    diffusion_job_id = add_job({
        'exe': "diffusion.py",
        'args': ["--network", string_kegg_union_fp, "--nodelist", nodelist_fp, "--gene-lists"]
                + sim_list_fps + ["--diffused", diffused_fp],
        'out': os.path.join(args.outdir, "diffusion.out"),
        'err': os.path.join(args.outdir, "diffusion.err")
    }, parents=[simulation_job_id])

    def add_factorization(manifolds_file, branch_outdir):
        """Schedule nmf_pathway.py and its evaluation for one set of manifolds."""
        # mkdir_p tolerates existing directories so the pipeline can be re-run.
        script_utils.mkdir_p(branch_outdir)
        gene_by_latent_fp = os.path.join(branch_outdir, "V.csv")
        factorization_id = add_job({
            'exe': "nmf_pathway.py",
            'args': [
                "--data", diffused_fp,
                '--k-latent', args.k_latent,
                "--manifolds-file", manifolds_file,
                "--nodelist", nodelist_fp,
                "--node-attribute", "name",
                "--gamma", args.gamma,
                "--outdir", branch_outdir
            ] + seed_args,
            'out': os.path.join(branch_outdir, "nmf_pathway.out"),
            'err': os.path.join(branch_outdir, "nmf_pathway.err")
        }, parents=[diffusion_job_id])
        # evaluation
        add_job({
            'exe': "evaluate_screen_sim.py",
            'args': [
                "--gene-by-latent", gene_by_latent_fp,
                "--nodelist", nodelist_fp,
                "--true-seeds", chosen_seeds_fp
            ],
            'out': os.path.join(branch_outdir, "evaluate.out"),
            'err': os.path.join(branch_outdir, "evaluate.err")
        }, parents=[factorization_id])
        return factorization_id, gene_by_latent_fp

    # factorization
    # two branches: nmf_pathway on true pathways and nmf_pathway on randomized pathways
    prmf_true_job_id, prmf_true_gene_by_latent_fp = add_factorization(
        args.manifolds_file_true,
        os.path.join(args.outdir, "nmf_pathway_true"))
    prmf_random_job_id, prmf_random_gene_by_latent_fp = add_factorization(
        args.manifolds_file_random,
        os.path.join(args.outdir, "nmf_pathway_random"))

    # plot
    # TODO rework name of arguments
    plot_outdir = os.path.join(args.outdir, 'pr_curves')
    script_utils.mkdir_p(plot_outdir)
    # run after both factorizations
    add_job({
        'exe': 'plot_pr_curve.py',
        'args': [
            '--gene-by-latent-csvs', prmf_random_gene_by_latent_fp, prmf_true_gene_by_latent_fp,
            '--labels', 'PRMF_random', 'PRMF_true',
            '--nodelist', nodelist_fp,
            '--true-seeds', chosen_seeds_fp,
            '--outdir', plot_outdir
        ],
        'out': os.path.join(plot_outdir, 'plot.out'),
        'err': os.path.join(plot_outdir, 'plot.err')
    }, parents=[prmf_true_job_id, prmf_random_job_id])

    script_utils.run_digraph(args.outdir, job_graph, condor=args.condor)
'out': os.path.join(NMF_outdir, 'evaluate_corr_match.out'), 'err': os.path.join(NMF_outdir, 'evaluate_corr_match.err') } NMF_eval_job_id = job_id eval_job_ids.append(NMF_eval_job_id) job_graph.add_node(NMF_eval_job_id, attrs) job_graph.add_edge(NMF_job_id, NMF_eval_job_id) job_id += 1 attrs = { 'exe': 'plot_mse.py', 'args': ['-i', args.outdir, '-o', args.outdir], 'out': os.path.join(args.outdir, 'plot_mse.out'), 'err': os.path.join(args.outdir, 'plot_mse.err') } plot_job_id = job_id job_graph.add_node(plot_job_id, attrs) for eval_job_id in eval_job_ids: job_graph.add_edge(eval_job_id, plot_job_id) job_id += 1 condor = False if args.condor: condor = True job_ids = script_utils.run_digraph(args.outdir, job_graph, exit_on_err=False, condor=condor)
def main():
    """Run the generative simulation benchmark of PRMF against NMF and PLIER.

    For each of ``--n-simulations`` simulations a ``gen_sim.py`` job writes
    into ``<outdir>/sim<i>``; the enabled methods (``nmf.py``,
    ``prmf_runner.py``, ``PLIER_wrapper.R``) each run after it in their own
    subdirectory.  A single ``gen_sim_eval.py`` job over the whole output
    directory runs after every method job of every simulation.
    """
    parser = argparse.ArgumentParser(description="""
Generative simulation pipeline to benchmark PRMF against NMF, PLIER, CoGAPS, NBS
""")
    parser.add_argument("--n-simulations", "-n", default=2, type=int,
                        help='Number of simulations to run: default 2')
    parser.add_argument("--outdir", help='Directory to write results to', required=True)
    parser.add_argument("--seed", help="Seed for random number generators", default=None)
    parser.add_argument("--condor", action='store_true', help="Run the pipeline on HTCondor")
    parser.add_argument("--dry-run", action='store_true',
                        help="Report which commands will be run but don't actually run then")
    parser.add_argument("--do-nmf", default=True,
                        help="If true, include NMF in the benchmarking; default true",
                        type=script_utils.str2bool)
    parser.add_argument("--do-prmf", default=True,
                        help="If true, include PRMF in the benchmarking; default true",
                        type=script_utils.str2bool)
    parser.add_argument("--do-plier", default=True,
                        help="If true, include PLIER in the benchmarking; default true",
                        type=script_utils.str2bool)
    args = parser.parse_args()
    prmf.script_utils.log_script(sys.argv)

    job_graph = nx.DiGraph()

    def add_job(attrs, parents=()):
        """Add a job node that runs after *parents* and return its id."""
        # Ids are handed out sequentially from 0, so the next free id is
        # always the current node count.
        new_id = job_graph.number_of_nodes()
        # Keyword expansion works on NetworkX 1.x and 2.x; a positional
        # attribute dict is rejected by 2.x.
        job_graph.add_node(new_id, **attrs)
        for parent in parents:
            job_graph.add_edge(parent, new_id)
        return new_id

    # TODO update use of seed (only gen_sim.py and nmf.py currently receive it)
    seed_args = ['--seed', args.seed] if args.seed is not None else []

    nmf_job_ids = []
    prmf_job_ids = []
    plier_job_ids = []
    for i in range(args.n_simulations):
        outdir = os.path.join(args.outdir, 'sim{}'.format(i))
        script_utils.mkdir_p(outdir)

        # simulation
        sim_job_id = add_job({
            'exe': 'gen_sim.py',
            'args': ['--outdir', outdir] + seed_args,
            'out': os.path.join(outdir, 'gen_sim.out'),
            'err': os.path.join(outdir, 'gen_sim.err'),
            'env': 'prmf'
        })

        # Inputs for the factorization jobs, expected in the simulation directory.
        # NOTE(review): the pathway count of 1000 is hard-coded here; confirm
        # it matches what gen_sim.py writes.
        data = os.path.join(outdir, 'X.csv')
        pathways = [
            os.path.join(outdir, 'pathway{}.graphml'.format(x))
            for x in range(1000)
        ]
        pathways_file = os.path.join(outdir, 'pathways_file.txt')

        if args.do_nmf:
            # NMF
            nmf_outdir = os.path.join(outdir, 'nmf')
            script_utils.mkdir_p(nmf_outdir)
            nmf_job_ids.append(add_job({
                'exe': 'nmf.py',
                'args': ['--data', data, '--outdir', nmf_outdir, '--k-latent', '30'] + seed_args,
                'out': os.path.join(nmf_outdir, 'nmf.out'),
                'err': os.path.join(nmf_outdir, 'nmf.err'),
                'env': 'prmf'
            }, parents=[sim_job_id]))

        if args.do_prmf:
            # PRMF
            prmf_outdir = os.path.join(outdir, 'prmf')
            script_utils.mkdir_p(prmf_outdir)
            prmf_job_ids.append(add_job({
                'exe': 'prmf_runner.py',
                'args': ['--data', data, '--manifolds'] + pathways
                        + ['--node-attribute', 'name', '--k-latent', '30', '--outdir', prmf_outdir],
                'out': os.path.join(prmf_outdir, 'prmf.out'),
                'err': os.path.join(prmf_outdir, 'prmf.err'),
                'env': 'prmf'
            }, parents=[sim_job_id]))

        if args.do_plier:
            # PLIER
            plier_outdir = os.path.join(outdir, 'plier')
            script_utils.mkdir_p(plier_outdir)
            plier_job_ids.append(add_job({
                'exe': 'PLIER_wrapper.R',
                'args': ['--data', data, '--pathways-file', pathways_file,
                         '--k-latent', '30', '--node-attribute', 'name',
                         '--L1', '50', '--L2', '50', '--outdir', plier_outdir],
                'out': os.path.join(plier_outdir, 'PLIER_wrapper.out'),
                'err': os.path.join(plier_outdir, 'PLIER_wrapper.err'),
                'env': 'prmf'
            }, parents=[sim_job_id]))

    # evaluation: one job over all simulations, after every method job
    eval_outdir = os.path.join(args.outdir, 'eval')
    script_utils.mkdir_p(eval_outdir)
    add_job({
        'exe': 'gen_sim_eval.py',
        'args': ['--indir', args.outdir, '--outdir', eval_outdir],
        'out': os.path.join(eval_outdir, 'gen_sim_eval.out'),
        'err': os.path.join(eval_outdir, 'gen_sim_eval.err'),
        'env': 'prmf'
    }, parents=nmf_job_ids + prmf_job_ids + plier_job_ids)

    script_utils.run_digraph(args.outdir, job_graph, condor=args.condor, dry_run=args.dry_run)
def main():
    """Evaluate nmf_pathway.py on simulated gene lists.

    Builds and runs a linear job chain, all writing into ``--outdir``:
    ``simulate_screens.py`` -> ``diffusion.py`` -> ``nmf_pathway.py`` ->
    ``evaluate_screen_sim.py``.  Jobs are always run locally (not on Condor).
    """
    parser = argparse.ArgumentParser(description="""
Evalute nmf_pathway.py by simulating gene lists
""")
    parser.add_argument("--rng-seed", help="Seed for random number generators", default=None)

    # simulation
    parser.add_argument("--n-gene-lists", help="Number of gene lists to simulate", type=int, default=6)
    parser.add_argument("--nodelist", help="Universe of node identifiers and an ordering on those identifiers", required=True)
    # TODO future versions manifolds should be distinct from node lists derived from pathways
    parser.add_argument("--seed-lists", required=True, nargs='+')
    parser.add_argument("--outdir", required=True)
    parser.add_argument("--simulator")

    # diffusion
    parser.add_argument("--network", required=True)

    # factorization
    parser.add_argument("--manifolds", required=True, nargs='+')

    # evaluation
    # (no additional arguments)
    args = parser.parse_args()

    job_graph = nx.DiGraph()

    def add_job(attrs, parents=()):
        """Add a job node that runs after *parents* and return its id."""
        # Ids are handed out sequentially from 0, so the next free id is
        # always the current node count.
        new_id = job_graph.number_of_nodes()
        # Keyword expansion works on NetworkX 1.x and 2.x; a positional
        # attribute dict is rejected by 2.x.
        job_graph.add_node(new_id, **attrs)
        for parent in parents:
            job_graph.add_edge(parent, new_id)
        return new_id

    # simulation
    simulate_args = ["--seed-lists"] + args.seed_lists + [
        "--n-gene-lists", str(args.n_gene_lists),
        "--nodelist", args.nodelist,
        "--outdir", args.outdir
    ]
    if args.simulator is not None:
        # --simulator has no default; forwarding None would put a non-string
        # into the argument list, so leave the choice to simulate_screens.py.
        simulate_args += ['--simulator', args.simulator]
    if args.rng_seed is not None:
        simulate_args += ['--rng-seed', args.rng_seed]
    simulation_job_id = add_job({
        'exe': "simulate_screens.py",
        'args': simulate_args,
        'out': os.path.join(args.outdir, "simulate_screens.out"),
        'err': os.path.join(args.outdir, "simulate_screens.err")
    })

    # Files the later jobs expect the simulation to have produced.
    sim_list_fps = [
        os.path.join(args.outdir, "sim_list_{}.txt".format(i + 1))
        for i in range(args.n_gene_lists)
    ]
    chosen_seeds_fp = os.path.join(args.outdir, "chosen_seeds.txt")

    # diffusion
    diffused_fp = os.path.join(args.outdir, "diffused.csv")
    diffusion_job_id = add_job({
        'exe': "diffusion.py",
        'args': ["--network", args.network, "--nodelist", args.nodelist, "--gene-lists"]
                + sim_list_fps + ["--diffused", diffused_fp],
        'out': os.path.join(args.outdir, "diffusion.out"),
        'err': os.path.join(args.outdir, "diffusion.err")
    }, parents=[simulation_job_id])

    # factorization
    factorize_args = ["--data", diffused_fp, "--manifolds"] + args.manifolds + [
        "--nodelist", args.nodelist,
        "--outdir", args.outdir
    ]
    if args.rng_seed is not None:
        factorize_args += ['--seed', args.rng_seed]
    gene_by_latent_fp = os.path.join(args.outdir, "V.csv")
    factorization_job_id = add_job({
        'exe': "nmf_pathway.py",
        'args': factorize_args,
        'out': os.path.join(args.outdir, "nmf_pathway.out"),
        'err': os.path.join(args.outdir, "nmf_pathway.err")
    }, parents=[diffusion_job_id])

    # evaluation
    # TODO new version
    add_job({
        'exe': "evaluate_screen_sim.py",
        'args': ["--gene-by-latent", gene_by_latent_fp,
                 "--nodelist", args.nodelist,
                 "--true-seeds", chosen_seeds_fp],
        'out': os.path.join(args.outdir, "evaluate.out"),
        'err': os.path.join(args.outdir, "evaluate.err")
    }, parents=[factorization_job_id])

    script_utils.run_digraph(args.outdir, job_graph, condor=False)