Exemple #1
0
def main():
  """Build and run a job graph that restarts nmf_pathway.py on randomized pathways.

  Each of the --n-runs runs contributes two jobs to the graph:

  1. randomize_network.R, given the pathway files and a per-run
     ``random_pathways`` directory.
  2. nmf_pathway.py, which depends on job 1 and is pointed at
     ``pathways_file.txt`` inside that directory (presumably written by
     randomize_network.R -- confirm against that script).

  All parsed arguments other than the ones popped below are forwarded to
  nmf_pathway.py via script_utils.args_to_list. The finished graph is handed
  to script_utils.run_digraph.

  The pathway files come from --manifolds; when that is not provided they are
  read from --manifolds-file, one filepath per line.
  """
  parser = argparse.ArgumentParser(description="""
Run nmf_pathway.py with different _r_andom _r_estarts _a_nd _e_dges.

Pass through arguments to nmf_pathway.py except for --condor, --manifolds-init, --n-runs, and --outdir

TODO
----
Capture --manifolds and --manifolds-file and randomize the edges before passing to nmf_pathway.py
""")
  parser.add_argument("--n-runs", type=int, default=2, help="Number of random restarts")
  parser.add_argument("--condor", action='store_true', help='If flag is provided, submit jobs to Condor rather than running them on this machine')
  prmf_args.add_prmf_arguments(parser)
  args = parser.parse_args()

  job_graph = nx.DiGraph()
  job_id = 0
  args_dict = vars(args)

  # Options consumed by this driver (or overridden per run) are removed so
  # that they are not forwarded to nmf_pathway.py.
  outdir = args_dict.pop('outdir')
  condor = args_dict.pop('condor')
  n_runs = args_dict.pop('n_runs')
  args_dict.pop('manifolds_init')
  manifolds = args_dict.pop('manifolds')
  manifolds_file = args_dict.pop('manifolds_file')

  # Fall back to --manifolds-file when --manifolds was not given; fail before
  # any directory is created if neither is usable.
  if not manifolds:
    if not manifolds_file:
      parser.error("one of --manifolds or --manifolds-file is required")
    with open(manifolds_file) as fh:
      manifolds = [line.strip() for line in fh if line.strip()]
  manifolds = list(manifolds)

  # The forwarded arguments are identical for every run.
  forwarded_args = script_utils.args_to_list(args_dict)

  for i in range(n_runs):
    run_outdir = os.path.join(outdir, "run{}".format(i))
    os.mkdir(run_outdir)

    random_pathway_dir = os.path.join(run_outdir, 'random_pathways')
    os.mkdir(random_pathway_dir)
    attrs = {
      'exe': 'randomize_network.R',
      'args': ['--infiles'] + manifolds + ['--outdir', random_pathway_dir],
      'out': os.path.join(random_pathway_dir, 'randomize_network.out'),
      'err': os.path.join(random_pathway_dir, 'randomize_network.err')
    }
    randomize_network_job_id = job_id
    # Keyword expansion works with both NetworkX 1.x and 2.x; the positional
    # attribute dict is rejected by NetworkX >= 2.0.
    job_graph.add_node(randomize_network_job_id, **attrs)
    job_id += 1
    pathways_file = os.path.join(random_pathway_dir, 'pathways_file.txt')

    attrs = {
      'exe': 'nmf_pathway.py',
      'args': forwarded_args + ['--outdir', run_outdir, '--manifolds-init', '--manifolds-file', pathways_file],
      'out': os.path.join(run_outdir, 'nmf_pathway.out'),
      'err': os.path.join(run_outdir, 'nmf_pathway.err')
    }
    nmf_pathway_job_id = job_id
    job_graph.add_node(nmf_pathway_job_id, **attrs)
    # Factorization must wait for the randomized pathways of the same run.
    job_graph.add_edge(randomize_network_job_id, nmf_pathway_job_id)
    job_id += 1

  script_utils.run_digraph(outdir, job_graph, condor=condor)
Exemple #2
0
def main():
    """Build and run a job graph of independent nmf_pathway.py random restarts.

    One nmf_pathway.py job is added per run (--n-runs of them), each writing
    to its own ``run<i>`` directory under --outdir and invoked with
    --manifolds-init. All parsed arguments other than --outdir, --condor,
    --manifolds-init and --n-runs are forwarded through
    script_utils.args_to_list. The jobs have no dependencies on one another;
    the graph is handed to script_utils.run_digraph.
    """
    parser = argparse.ArgumentParser(description="""
Run nmf_pathway.py with different _r_andom _r_estarts.

Pass through arguments to nmf_pathway.py except for --condor, --manifolds-init, --n-runs, and --outdir
""")
    parser.add_argument("--n-runs",
                        type=int,
                        default=2,
                        help="Number of random restarts")
    parser.add_argument(
        "--condor",
        action='store_true',
        help=
        'If flag is provided, submit jobs to Condor rather than running them on this machine'
    )
    prmf_args.add_prmf_arguments(parser)
    args = parser.parse_args()

    job_graph = nx.DiGraph()
    args_dict = vars(args)

    # Options consumed by this driver (or overridden per run) are removed so
    # that they are not forwarded to nmf_pathway.py.
    outdir = args_dict.pop('outdir')
    condor = args_dict.pop('condor')
    args_dict.pop('manifolds_init')
    n_runs = args_dict.pop('n_runs')

    # The forwarded arguments are identical for every run.
    forwarded_args = script_utils.args_to_list(args_dict)

    for job_id in range(n_runs):
        run_outdir = os.path.join(outdir, "run{}".format(job_id))
        os.mkdir(run_outdir)
        attrs = {
            'exe': 'nmf_pathway.py',
            'args': forwarded_args + ['--outdir', run_outdir, '--manifolds-init'],
            'out': os.path.join(run_outdir, 'nmf_pathway.out'),
            'err': os.path.join(run_outdir, 'nmf_pathway.err')
        }
        # Keyword expansion works with both NetworkX 1.x and 2.x; the
        # positional attribute dict is rejected by NetworkX >= 2.0.
        job_graph.add_node(job_id, **attrs)

    script_utils.run_digraph(outdir, job_graph, condor=condor)
def main():
    """Build and run a job graph comparing nmf_pathway.py on true vs. randomized pathways.

    This variant works on a provided dataset (--data) rather than simulated
    and diffused gene lists, so there are no simulation or diffusion jobs.
    The graph contains, for each of the "true" and "random" manifold files:

    * an nmf_pathway.py job writing to ``nmf_pathway_<branch>`` under --outdir;
    * an evaluate_screen_sim.py job that depends on it and is given the
      ``V.csv`` path in that directory;

    followed by a single plot_pr_curve.py job that depends on both
    factorization jobs. The graph is handed to script_utils.run_digraph.

    NOTE(review): the original body referenced ``args.outdir`` without
    declaring --outdir, and used ``diffusion_job_id``, ``diffused_fp``,
    ``nodelist_fp`` and ``chosen_seeds_fp`` which were never defined here.
    They are replaced by --outdir, --data, --nodelist and a new --true-seeds
    option respectively; confirm --true-seeds matches the intended source of
    ground-truth seeds for a real dataset.
    """
    parser = argparse.ArgumentParser(description="""
Evalute nmf_pathway.py using true pathways against randomized pathways for a "real" dataset instead of a simulated one
""")
    parser.add_argument("--rng-seed",
                        help="Seed for random number generators",
                        default=None)
    parser.add_argument(
        "--condor",
        action='store_true',
        help="Flag which indicates we should submit jobs to Condor")
    parser.add_argument("--outdir",
                        help="Directory to write results to",
                        required=True)

    # prepare data
    parser.add_argument("--data", help="CSV", required=True)
    parser.add_argument(
        "--nodelist",
        help="Contains node identifiers for data and manifolds",
        required=True)

    # randomization
    # (run kegg.R then run randomize_network.R then prepare two separate files manifolds_file_true
    # and manifolds_file_random which contain filepaths to some of the kegg and some of randomized kegg, respectively)
    # TODO trouble is the selection of a subset of networks into a file and that randomize_network.R runs on directories
    # could be solved by specifying selection with KEGG pathway identifiers and a directory rather than by a filepath

    # factorization
    parser.add_argument("--manifolds-file-true", required=True)
    parser.add_argument("--manifolds-file-random", required=True)
    parser.add_argument("--gamma", default="1.0")
    parser.add_argument("--k-latent", default="6")
    # other arguments used in factorization: outdir, nodelist, data

    # evaluation
    parser.add_argument(
        "--true-seeds",
        help="File of true seeds passed to the evaluation and plotting scripts",
        required=True)
    args = parser.parse_args()

    job_graph = nx.DiGraph()
    job_id = 0

    # factorization + evaluation
    # two branches: nmf_pathway on true pathways and nmf_pathway on randomized pathways
    branches = [
        ('true', args.manifolds_file_true),
        ('random', args.manifolds_file_random),
    ]
    factorization_job_ids = {}
    gene_by_latent_fps = {}
    for branch, manifolds_file in branches:
        branch_outdir = os.path.join(args.outdir, "nmf_pathway_" + branch)
        os.mkdir(branch_outdir)

        attrs = {
            'exe':
            "nmf_pathway.py",
            'args': [
                "--data", args.data, '--k-latent', args.k_latent,
                "--manifolds-file", manifolds_file, "--nodelist",
                args.nodelist, "--node-attribute", "name", "--gamma",
                args.gamma, "--outdir", branch_outdir
            ],
            'out':
            os.path.join(branch_outdir, "nmf_pathway.out"),
            'err':
            os.path.join(branch_outdir, "nmf_pathway.err")
        }
        if args.rng_seed is not None:
            attrs['args'] += ['--seed', args.rng_seed]
        gene_by_latent_fp = os.path.join(branch_outdir, "V.csv")
        gene_by_latent_fps[branch] = gene_by_latent_fp
        factorization_job_id = job_id
        factorization_job_ids[branch] = factorization_job_id
        # The data is supplied up front, so this job has no prerequisites.
        # Keyword expansion works with both NetworkX 1.x and 2.x; the
        # positional attribute dict is rejected by NetworkX >= 2.0.
        job_graph.add_node(factorization_job_id, **attrs)
        job_id += 1

        # evaluation
        attrs = {
            'exe':
            "evaluate_screen_sim.py",
            'args': [
                "--gene-by-latent", gene_by_latent_fp, "--nodelist",
                args.nodelist, "--true-seeds", args.true_seeds
            ],
            'out':
            os.path.join(branch_outdir, "evaluate.out"),
            'err':
            os.path.join(branch_outdir, "evaluate.err")
        }
        job_graph.add_node(job_id, **attrs)
        job_graph.add_edge(factorization_job_id, job_id)
        job_id += 1

    # plot
    # TODO rework name of arguments
    plot_outdir = os.path.join(args.outdir, 'pr_curves')
    os.mkdir(plot_outdir)
    attrs = {
        'exe':
        'plot_pr_curve.py',
        'args': [
            '--gene-by-latent-csvs', gene_by_latent_fps['random'],
            gene_by_latent_fps['true'], '--labels', 'PRMF_random',
            'PRMF_real', '--nodelist', args.nodelist, '--true-seeds',
            args.true_seeds, '--outdir', plot_outdir
        ],
        'out':
        os.path.join(plot_outdir, 'plot.out'),
        'err':
        os.path.join(plot_outdir, 'plot.err')
    }
    job_graph.add_node(job_id, **attrs)
    # run after both factorizations
    job_graph.add_edge(factorization_job_ids['true'], job_id)
    job_graph.add_edge(factorization_job_ids['random'], job_id)
    job_id += 1

    script_utils.run_digraph(args.outdir, job_graph, condor=args.condor)
def main():
  """Build and run the simulation benchmark job graph around nmf_pathway.py.

  Jobs, in the order their ids are assigned:

  * simulate_screens.py, then diffusion.py (which depends on the simulation);
  * for each enabled method (NMF, PRMF/nmf_pathway, PLIER, NBS, CoGAPS) a
    factorization job followed by an evaluate_screen_sim.py job that depends
    on it. PRMF is always run; the others are toggled by --do-nmf,
    --do-plier, --do-nbs and --do-cogaps. All factorization jobs depend on
    the diffusion job except NBS, which depends on the simulation job;
  * plot_pr_curve.py, which depends on every enabled factorization job.

  The graph is handed to script_utils.run_digraph, honoring --condor and
  --dry-run.
  """
  parser = argparse.ArgumentParser(description=
"""
Evalute nmf_pathway.py by simulating gene lists and compare against nmf_init.py
""")
  # run environment
  parser.add_argument("--rng-seed", help="Seed for random number generators", default=None)
  parser.add_argument("--condor", action='store_true')
  parser.add_argument("--dry-run", action='store_true')
  parser.add_argument("--do-nmf", default=True, type=script_utils.str2bool)
  parser.add_argument("--do-plier", default=True, type=script_utils.str2bool)
  parser.add_argument("--do-nbs", default=True, type=script_utils.str2bool)
  parser.add_argument("--do-cogaps", default=True, type=script_utils.str2bool)

  # simulation
  parser.add_argument("--n-gene-lists", help="Number of gene lists to simulate", type=int, default=6)
  parser.add_argument("--nodelist", help="Universe of node identifiers and an ordering on those identifiers", required=True)
  parser.add_argument("--seed-lists", required=True, nargs='+') # TODO future versions manifolds should be distinct from node lists derived from pathways
  parser.add_argument("--outdir", required=True)
  parser.add_argument("--simulator")
  parser.add_argument("--noise-pr", default="0.05")

  # diffusion
  parser.add_argument("--network", required=True)
  # other arguments used in diffusion: nodelist

  # factorization
  parser.add_argument("--cross-validation", help="Fraction of data in [0,1] to hold out and test reconstruction performance on")
  parser.add_argument("--manifolds-file", required=True)
  parser.add_argument("--gamma", default="1.0")
  parser.add_argument("--k-latent", default="6")
  # other arguments used in factorization: outdir, nodelist, data

  # evaluation
  # (no additional arguments)
  args = parser.parse_args()

  job_graph = nx.DiGraph()
  job_id = 0

  # Collected as each method is added, in the order NMF, PRMF, PLIER, NBS,
  # CoGAPS; consumed by the plotting job at the end.
  labels = []
  gene_by_latent_csvs = []
  method_job_ids = []

  # simulation
  simulation_args = ["--seed-lists"] + args.seed_lists + ["--n-gene-lists", str(args.n_gene_lists), "--nodelist", args.nodelist, "--outdir", args.outdir, '--noise-pr', args.noise_pr]
  # --simulator has no default here; only forward it when given so that None
  # never ends up in the argument list.
  if args.simulator is not None:
    simulation_args += ['--simulator', args.simulator]
  if args.rng_seed is not None:
    simulation_args += ['--rng-seed', args.rng_seed]
  attrs = {
    'exe': "simulate_screens.py",
    'args': simulation_args,
    'out': os.path.join(args.outdir, "simulate_screens.out"),
    'err': os.path.join(args.outdir, "simulate_screens.err"),
    'env': 'prmf'
  }
  simulation_job_id = job_id
  # Keyword expansion works with both NetworkX 1.x and 2.x; the positional
  # attribute dict is rejected by NetworkX >= 2.0.
  job_graph.add_node(simulation_job_id, **attrs)
  job_id += 1
  sim_list_fps = [os.path.join(args.outdir, "sim_list_{}.txt".format(i + 1)) for i in range(args.n_gene_lists)]
  chosen_seeds_fp = os.path.join(args.outdir, "chosen_seeds.txt")

  def evaluation_attrs(gene_by_latent_fp, method_outdir):
    # Every method is scored by the same script against the same seeds file;
    # only the input matrix and the log location differ.
    return {
      'exe': "evaluate_screen_sim.py",
      'args': ["--gene-by-latent", gene_by_latent_fp, "--nodelist", args.nodelist, "--true-seeds", chosen_seeds_fp],
      'out': os.path.join(method_outdir, "evaluate.out"),
      'err': os.path.join(method_outdir, "evaluate.err"),
      'env': 'prmf'
    }

  # diffusion
  diffused_fp = os.path.join(args.outdir, "diffused.csv")
  attrs = {
    'exe': "diffusion.py",
    'args': ["--network", args.network, "--nodelist", args.nodelist, "--gene-lists"] + sim_list_fps + ["--diffused", diffused_fp],
    'out': os.path.join(args.outdir, "diffusion.out"),
    'err': os.path.join(args.outdir, "diffusion.err"),
    'env': 'prmf'
  }
  diffusion_job_id = job_id
  job_graph.add_node(diffusion_job_id, **attrs)
  job_graph.add_edge(simulation_job_id, diffusion_job_id)
  job_id += 1

  # factorization
  # 1) nmf - {{
  if args.do_nmf:
    nmf_outdir = os.path.join(args.outdir, "nmf")
    script_utils.mkdir_p(nmf_outdir)
    attrs = {
      'exe': "nmf.py",
      'args': ['--data', diffused_fp, '--k-latent', args.k_latent, '--outdir', nmf_outdir],
      'out': os.path.join(nmf_outdir, 'nmf.out'),
      'err': os.path.join(nmf_outdir, 'nmf.err'),
      'env': 'prmf'
    }
    if args.rng_seed is not None:
      attrs['args'] += ['--seed', args.rng_seed]
    if args.cross_validation is not None:
      attrs['args'] += ['--cross-validation', args.cross_validation]
    nmf_gene_by_latent_fp = os.path.join(nmf_outdir, "V.csv")
    nmf_job_id = job_id
    job_graph.add_node(nmf_job_id, **attrs)
    job_graph.add_edge(diffusion_job_id, nmf_job_id)
    job_id += 1

    # evaluation
    job_graph.add_node(job_id, **evaluation_attrs(nmf_gene_by_latent_fp, nmf_outdir))
    job_graph.add_edge(nmf_job_id, job_id)
    job_id += 1

    labels.append('NMF')
    gene_by_latent_csvs.append(nmf_gene_by_latent_fp)
    method_job_ids.append(nmf_job_id)
  # }} - nmf

  # 2) nmf_pathway - {{
  nmf_pathway_outdir = os.path.join(args.outdir, "nmf_pathway")
  script_utils.mkdir_p(nmf_pathway_outdir)
  attrs = {
    'exe': "nmf_pathway.py",
    'args': ["--data", diffused_fp, '--k-latent', args.k_latent, "--manifolds-file", args.manifolds_file, "--node-attribute", "name", "--nodelist", args.nodelist, "--gamma", args.gamma, "--outdir", nmf_pathway_outdir],
    'out': os.path.join(nmf_pathway_outdir, "nmf_pathway.out"),
    'err': os.path.join(nmf_pathway_outdir, "nmf_pathway.err"),
    'env': 'prmf'
  }
  if args.rng_seed is not None:
    attrs['args'] += ['--seed', args.rng_seed]
  if args.cross_validation is not None:
    attrs['args'] += ['--cross-validation', args.cross_validation]
  prmf_gene_by_latent_fp = os.path.join(nmf_pathway_outdir, "V.csv")
  prmf_job_id = job_id
  job_graph.add_node(prmf_job_id, **attrs)
  job_graph.add_edge(diffusion_job_id, prmf_job_id)
  job_id += 1

  # evaluation
  job_graph.add_node(job_id, **evaluation_attrs(prmf_gene_by_latent_fp, nmf_pathway_outdir))
  job_graph.add_edge(prmf_job_id, job_id)
  job_id += 1

  labels.append('PRMF')
  gene_by_latent_csvs.append(prmf_gene_by_latent_fp)
  method_job_ids.append(prmf_job_id)
  # }} - nmf_pathway

  # 3) PLIER - {{
  if args.do_plier:
    PLIER_outdir = os.path.join(args.outdir, "PLIER")
    script_utils.mkdir_p(PLIER_outdir)
    attrs = {
      'exe': 'PLIER_wrapper.R',
      'args': ['--data', diffused_fp, '--nodelist', args.nodelist, '--k-latent', args.k_latent, '--pathways-file', args.manifolds_file, '--node-attribute', 'name', '--outdir', PLIER_outdir],
      'out': os.path.join(PLIER_outdir, "PLIER_wrapper.out"),
      'err': os.path.join(PLIER_outdir, "PLIER_wrapper.err"),
      'env': 'PLIER'
    }
    if args.rng_seed is not None:
      attrs['args'] += ['--seed', args.rng_seed]
    PLIER_gene_by_latent_fp = os.path.join(PLIER_outdir, "Z.csv")
    PLIER_job_id = job_id
    job_graph.add_node(PLIER_job_id, **attrs)
    job_graph.add_edge(diffusion_job_id, PLIER_job_id)
    job_id += 1

    # evaluation
    job_graph.add_node(job_id, **evaluation_attrs(PLIER_gene_by_latent_fp, PLIER_outdir))
    job_graph.add_edge(PLIER_job_id, job_id)
    job_id += 1

    labels.append("PLIER")
    gene_by_latent_csvs.append(PLIER_gene_by_latent_fp)
    method_job_ids.append(PLIER_job_id)
  # }} - PLIER

  # 4) NBS - {{
  # NOTE NBS does its own diffusion based on binary somatic mutation profiles so we pass the simulated hits rather than our diffused data
  if args.do_nbs:
    NBS_outdir = os.path.join(args.outdir, 'NBS')
    script_utils.mkdir_p(NBS_outdir)
    attrs = {
      'exe': 'pyNBS_wrapper.py',
      'args': ['--nodelist', args.nodelist, '--gene-lists'] + sim_list_fps + ['--network', args.network, '--k-latent', args.k_latent, '--outdir', NBS_outdir],
      'out': os.path.join(NBS_outdir, 'pyNBS_wrapper.out'),
      'err': os.path.join(NBS_outdir, 'pyNBS_wrapper.err'),
      'env': 'pyNBS'
    }
    NBS_job_id = job_id
    NBS_gene_by_latent_fp = os.path.join(NBS_outdir, "W.csv")
    job_graph.add_node(NBS_job_id, **attrs)
    # Depends on the simulation rather than the diffusion (see NOTE above).
    job_graph.add_edge(simulation_job_id, NBS_job_id)
    job_id += 1

    # evaluation
    job_graph.add_node(job_id, **evaluation_attrs(NBS_gene_by_latent_fp, NBS_outdir))
    job_graph.add_edge(NBS_job_id, job_id)
    job_id += 1

    labels.append("NBS")
    gene_by_latent_csvs.append(NBS_gene_by_latent_fp)
    method_job_ids.append(NBS_job_id)
  # }} - NBS

  # 5) CoGAPS - {{
  if args.do_cogaps:
    CoGAPS_outdir = os.path.join(args.outdir, 'CoGAPS')
    script_utils.mkdir_p(CoGAPS_outdir)
    attrs = {
      'exe': 'CoGAPS_wrapper.R',
      # Write into the CoGAPS directory: that is where P.csv is looked for
      # below, and it matches how every other method is invoked.
      'args': ['--data', diffused_fp, '--k-latent', args.k_latent, '--outdir', CoGAPS_outdir],
      'out': os.path.join(CoGAPS_outdir, 'CoGAPS_wrapper.out'),
      'err': os.path.join(CoGAPS_outdir, 'CoGAPS_wrapper.err'),
      'env': 'CoGAPS'
    }
    CoGAPS_job_id = job_id
    CoGAPS_gene_by_latent_fp = os.path.join(CoGAPS_outdir, "P.csv")
    job_graph.add_node(CoGAPS_job_id, **attrs)
    job_graph.add_edge(diffusion_job_id, CoGAPS_job_id)
    job_id += 1

    # evaluation
    job_graph.add_node(job_id, **evaluation_attrs(CoGAPS_gene_by_latent_fp, CoGAPS_outdir))
    # The evaluation must wait for the CoGAPS job itself (not the NBS job).
    job_graph.add_edge(CoGAPS_job_id, job_id)
    job_id += 1

    labels.append("CoGAPS")
    gene_by_latent_csvs.append(CoGAPS_gene_by_latent_fp)
    method_job_ids.append(CoGAPS_job_id)
  # }} - CoGAPS

  # plot
  plot_outdir = os.path.join(args.outdir, 'pr_curves')
  script_utils.mkdir_p(plot_outdir)
  attrs = {
    'exe': 'plot_pr_curve.py',
    'args': ['--gene-by-latent-csvs'] + gene_by_latent_csvs +
      ['--labels'] + labels +
      ['--nodelist', args.nodelist, '--true-seeds', chosen_seeds_fp, '--outdir', plot_outdir],
    'out': os.path.join(plot_outdir, 'plot.out'),
    'err': os.path.join(plot_outdir, 'plot.err'),
    'env': 'prmf'
  }
  plot_job_id = job_id
  job_graph.add_node(plot_job_id, **attrs)
  # run after all methods
  for method_job_id in method_job_ids:
    job_graph.add_edge(method_job_id, plot_job_id)
  job_id += 1

  script_utils.run_digraph(args.outdir, job_graph, condor=args.condor, dry_run=args.dry_run)
def main():
    """Build and run a simulated benchmark of nmf_pathway.py on true vs. randomized pathways.

    Jobs, in the order their ids are assigned:

    * prepare_nodelist.py, given --stringdb and the graphml paths listed (one
      per line) in --manifolds-file-true;
    * simulate_screens.py, which depends on the nodelist job;
    * diffusion.py, which depends on the simulation;
    * for each of the "true" and "random" manifold files, an nmf_pathway.py
      job that depends on the diffusion, followed by an
      evaluate_screen_sim.py job that depends on that factorization;
    * plot_pr_curve.py, which depends on both factorization jobs.

    The graph is handed to script_utils.run_digraph.
    """
    parser = argparse.ArgumentParser(description="""
Evalute nmf_pathway.py against true pathways and randomized pathways
""")
    parser.add_argument("--rng-seed",
                        help="Seed for random number generators",
                        default=None)
    parser.add_argument(
        "--condor",
        action='store_true',
        help="Flag which indicates we should submit jobs to Condor")

    # prepare networks
    parser.add_argument(
        '--stringdb',
        help="STRINGdb database file e.g. 9606.protein.links.detailed.v10.txt",
        required=True)

    # simulation
    parser.add_argument("--n-gene-lists",
                        help="Number of gene lists to simulate",
                        type=int,
                        default=6)
    parser.add_argument(
        "--seed-lists", required=True, nargs='+'
    )  # TODO future versions manifolds should be distinct from node lists derived from pathways
    parser.add_argument("--outdir", required=True)
    parser.add_argument("--simulator",
                        default='mixture')  # TODO check option value
    parser.add_argument("--noise-pr", default="0.05")

    # randomization
    # (run kegg.R then run randomize_network.R then prepare two separate files manifolds_file_true
    # and manifolds_file_random which contain filepaths to some of the kegg and some of randomized kegg, respectively)
    # TODO trouble is the selection of a subset of networks into a file and that kegg.R runs on directories
    # could be solved by specifying selection with KEGG pathway identifiers and a directory rather than by a filepath

    # diffusion
    # (no additional arguments)
    # other arguments used in diffusion: nodelist

    # factorization
    parser.add_argument("--manifolds-file-true", required=True)
    parser.add_argument("--manifolds-file-random", required=True)
    parser.add_argument("--gamma", default="1.0")
    parser.add_argument("--k-latent", default="6")
    # other arguments used in factorization: outdir, nodelist, data

    # evaluation
    # (no additional arguments)
    args = parser.parse_args()

    job_graph = nx.DiGraph()
    job_id = 0

    # prepare networks
    nodelist_fp = os.path.join(args.outdir, 'nodelist.txt')
    string_kegg_union_fp = os.path.join(args.outdir,
                                        'string_kegg_union.graphml')
    with open(args.manifolds_file_true) as fh:
        # Skip blank lines (e.g. a trailing newline) so that no empty path is
        # passed to --graphmls.
        manifold_fps = [line.rstrip() for line in fh if line.strip()]
    attrs = {
        'exe':
        'prepare_nodelist.py',
        'args': ['--stringdb', args.stringdb, '--graphmls'] + manifold_fps + [
            '--out-nodelist', nodelist_fp, '--out-graph', string_kegg_union_fp,
            '--node-attribute', 'name'
        ],
        'out':
        os.path.join(args.outdir, 'prepare_nodelist.out'),
        'err':
        os.path.join(args.outdir, 'prepare_nodelist.err')
    }
    prepare_job_id = job_id
    # Keyword expansion works with both NetworkX 1.x and 2.x; the positional
    # attribute dict is rejected by NetworkX >= 2.0.
    job_graph.add_node(prepare_job_id, **attrs)
    job_id += 1

    # simulation
    attrs = {
        'exe':
        "simulate_screens.py",
        'args': ["--seed-lists"] + args.seed_lists + [
            "--n-gene-lists",
            str(args.n_gene_lists), "--nodelist", nodelist_fp, "--outdir",
            args.outdir, '--simulator', args.simulator, '--noise-pr',
            args.noise_pr
        ],
        'out':
        os.path.join(args.outdir, "simulate_screens.out"),
        'err':
        os.path.join(args.outdir, "simulate_screens.err")
    }
    if args.rng_seed is not None:
        attrs['args'] += ['--rng-seed', args.rng_seed]
    simulation_job_id = job_id
    job_graph.add_node(simulation_job_id, **attrs)
    job_graph.add_edge(prepare_job_id, simulation_job_id)
    job_id += 1
    sim_list_fps = [
        os.path.join(args.outdir, "sim_list_{}.txt".format(i + 1))
        for i in range(args.n_gene_lists)
    ]
    chosen_seeds_fp = os.path.join(args.outdir, "chosen_seeds.txt")

    # diffusion
    diffused_fp = os.path.join(args.outdir, "diffused.csv")
    attrs = {
        'exe':
        "diffusion.py",
        'args': [
            "--network", string_kegg_union_fp, "--nodelist", nodelist_fp,
            "--gene-lists"
        ] + sim_list_fps + ["--diffused", diffused_fp],
        'out':
        os.path.join(args.outdir, "diffusion.out"),
        'err':
        os.path.join(args.outdir, "diffusion.err")
    }
    diffusion_job_id = job_id
    job_graph.add_node(diffusion_job_id, **attrs)
    job_graph.add_edge(simulation_job_id, diffusion_job_id)
    job_id += 1

    # factorization + evaluation
    # two branches: nmf_pathway on true pathways and nmf_pathway on randomized pathways
    branches = [
        ('true', args.manifolds_file_true),
        ('random', args.manifolds_file_random),
    ]
    factorization_job_ids = {}
    gene_by_latent_fps = {}
    for branch, manifolds_file in branches:
        branch_outdir = os.path.join(args.outdir, "nmf_pathway_" + branch)
        os.mkdir(branch_outdir)

        attrs = {
            'exe':
            "nmf_pathway.py",
            'args': [
                "--data", diffused_fp, '--k-latent', args.k_latent,
                "--manifolds-file", manifolds_file, "--nodelist",
                nodelist_fp, "--node-attribute", "name", "--gamma",
                args.gamma, "--outdir", branch_outdir
            ],
            'out':
            os.path.join(branch_outdir, "nmf_pathway.out"),
            'err':
            os.path.join(branch_outdir, "nmf_pathway.err")
        }
        if args.rng_seed is not None:
            attrs['args'] += ['--seed', args.rng_seed]
        gene_by_latent_fp = os.path.join(branch_outdir, "V.csv")
        gene_by_latent_fps[branch] = gene_by_latent_fp
        factorization_job_id = job_id
        factorization_job_ids[branch] = factorization_job_id
        job_graph.add_node(factorization_job_id, **attrs)
        job_graph.add_edge(diffusion_job_id, factorization_job_id)
        job_id += 1

        # evaluation
        attrs = {
            'exe':
            "evaluate_screen_sim.py",
            'args': [
                "--gene-by-latent", gene_by_latent_fp, "--nodelist",
                nodelist_fp, "--true-seeds", chosen_seeds_fp
            ],
            'out':
            os.path.join(branch_outdir, "evaluate.out"),
            'err':
            os.path.join(branch_outdir, "evaluate.err")
        }
        job_graph.add_node(job_id, **attrs)
        job_graph.add_edge(factorization_job_id, job_id)
        job_id += 1

    # plot
    # TODO rework name of arguments
    plot_outdir = os.path.join(args.outdir, 'pr_curves')
    os.mkdir(plot_outdir)
    attrs = {
        'exe':
        'plot_pr_curve.py',
        'args': [
            '--gene-by-latent-csvs', gene_by_latent_fps['random'],
            gene_by_latent_fps['true'], '--labels', 'PRMF_random',
            'PRMF_true', '--nodelist', nodelist_fp, '--true-seeds',
            chosen_seeds_fp, '--outdir', plot_outdir
        ],
        'out':
        os.path.join(plot_outdir, 'plot.out'),
        'err':
        os.path.join(plot_outdir, 'plot.err')
    }
    job_graph.add_node(job_id, **attrs)
    # run after both factorizations
    job_graph.add_edge(factorization_job_ids['true'], job_id)
    job_graph.add_edge(factorization_job_ids['random'], job_id)
    job_id += 1

    script_utils.run_digraph(args.outdir, job_graph, condor=args.condor)
Exemple #6
0
            'out':
            os.path.join(NMF_outdir, 'evaluate_corr_match.out'),
            'err':
            os.path.join(NMF_outdir, 'evaluate_corr_match.err')
        }
        NMF_eval_job_id = job_id
        eval_job_ids.append(NMF_eval_job_id)
        job_graph.add_node(NMF_eval_job_id, attrs)
        job_graph.add_edge(NMF_job_id, NMF_eval_job_id)
        job_id += 1

    attrs = {
        'exe': 'plot_mse.py',
        'args': ['-i', args.outdir, '-o', args.outdir],
        'out': os.path.join(args.outdir, 'plot_mse.out'),
        'err': os.path.join(args.outdir, 'plot_mse.err')
    }
    plot_job_id = job_id
    job_graph.add_node(plot_job_id, attrs)
    for eval_job_id in eval_job_ids:
        job_graph.add_edge(eval_job_id, plot_job_id)
    job_id += 1

    condor = False
    if args.condor:
        condor = True
    job_ids = script_utils.run_digraph(args.outdir,
                                       job_graph,
                                       exit_on_err=False,
                                       condor=condor)
Exemple #7
0
def main():
  """Build and run the generative simulation benchmarking job graph.

  For each of ``--n-simulations`` simulations a ``gen_sim.py`` job is created,
  followed by the enabled factorization jobs (NMF, PRMF, PLIER), each of which
  depends on that simulation's ``gen_sim.py`` job. A single ``gen_sim_eval.py``
  job runs after every factorization job. The finished graph is handed to
  ``script_utils.run_digraph``.

  Side effects: the per-simulation, per-method and evaluation output
  directories are created under ``--outdir`` with ``script_utils.mkdir_p``
  while the graph is being built, i.e. also when ``--dry-run`` is given.
  """
  parser = argparse.ArgumentParser(description="""
Generative simulation pipeline to benchmark PRMF against NMF, PLIER, CoGAPS, NBS
""")
  parser.add_argument("--n-simulations", "-n", default=2, type=int, help='Number of simulations to run: default 2')
  parser.add_argument("--outdir", help='Directory to write results to', required=True)
  parser.add_argument("--seed", help="Seed for random number generators", default=None)
  parser.add_argument("--condor", action='store_true', help="Run the pipeline on HTCondor")
  parser.add_argument("--dry-run", action='store_true', help="Report which commands will be run but don't actually run then")
  parser.add_argument("--do-nmf", default=True, help="If true, include NMF in the benchmarking; default true", type=script_utils.str2bool)
  parser.add_argument("--do-prmf", default=True, help="If true, include PRMF in the benchmarking; default true", type=script_utils.str2bool)
  parser.add_argument("--do-plier", default=True, help="If true, include PLIER in the benchmarking; default true", type=script_utils.str2bool)
  args = parser.parse_args()
  prmf.script_utils.log_script(sys.argv)

  job_graph = nx.DiGraph()

  def add_job(attrs, parents=()):
    """Add a job node with attributes `attrs` that runs after `parents`; return its id."""
    # Job ids are consecutive integers starting at 0, so the current node
    # count is always the next unused id.
    job_id = job_graph.number_of_nodes()
    # Keyword expansion is accepted by both the networkx 1.x signature
    # add_node(n, attr_dict=None, **attr) and the 2.x signature
    # add_node(n, **attr); a positional attribute dict is rejected by 2.x.
    job_graph.add_node(job_id, **attrs)
    for parent in parents:
      job_graph.add_edge(parent, job_id)
    return job_id

  # TODO update use of seed
  # As written, every simulation and every NMF job receives the same --seed
  # value; the PRMF and PLIER jobs are not given a seed at all.
  seed_args = [] if args.seed is None else ['--seed', args.seed]

  # NOTE(review): assumes gen_sim.py writes pathway0.graphml through
  # pathway999.graphml into its --outdir -- confirm against gen_sim.py
  n_pathways = 1000

  nmf_job_ids = []
  prmf_job_ids = []
  plier_job_ids = []
  for i in range(args.n_simulations):
    outdir = os.path.join(args.outdir, 'sim{}'.format(i))
    script_utils.mkdir_p(outdir)

    # Paths the downstream jobs read; presumably produced by gen_sim.py.
    data = os.path.join(outdir, 'X.csv')
    pathways = [os.path.join(outdir, 'pathway{}.graphml'.format(x)) for x in range(n_pathways)]
    pathways_file = os.path.join(outdir, 'pathways_file.txt')

    # simulation
    sim_job_id = add_job({
      'exe': 'gen_sim.py',
      'args': ['--outdir', outdir] + seed_args,
      'out': os.path.join(outdir, 'gen_sim.out'),
      'err': os.path.join(outdir, 'gen_sim.err'),
      'env': 'prmf'
    })

    if args.do_nmf:
      # NMF
      nmf_outdir = os.path.join(outdir, 'nmf')
      script_utils.mkdir_p(nmf_outdir)
      nmf_job_ids.append(add_job({
        'exe': 'nmf.py',
        'args': ['--data', data, '--outdir', nmf_outdir, '--k-latent', '30'] + seed_args,
        'out': os.path.join(nmf_outdir, 'nmf.out'),
        'err': os.path.join(nmf_outdir, 'nmf.err'),
        'env': 'prmf'
      }, parents=[sim_job_id]))

    if args.do_prmf:
      # PRMF
      prmf_outdir = os.path.join(outdir, 'prmf')
      script_utils.mkdir_p(prmf_outdir)
      prmf_job_ids.append(add_job({
        'exe': 'prmf_runner.py',
        'args': ['--data', data, '--manifolds'] + pathways + ['--node-attribute', 'name', '--k-latent', '30', '--outdir', prmf_outdir],
        'out': os.path.join(prmf_outdir, 'prmf.out'),
        'err': os.path.join(prmf_outdir, 'prmf.err'),
        'env': 'prmf'
      }, parents=[sim_job_id]))

    if args.do_plier:
      # PLIER
      plier_outdir = os.path.join(outdir, 'plier')
      script_utils.mkdir_p(plier_outdir)
      plier_job_ids.append(add_job({
        'exe': 'PLIER_wrapper.R',
        'args': ['--data', data, '--pathways-file', pathways_file, '--k-latent', '30', '--node-attribute', 'name', '--L1', '50', '--L2', '50', '--outdir', plier_outdir],
        'out': os.path.join(plier_outdir, 'PLIER_wrapper.out'),
        'err': os.path.join(plier_outdir, 'PLIER_wrapper.err'),
        'env': 'prmf'
      }, parents=[sim_job_id]))

  # evaluation: a single job that waits for every factorization job of every
  # simulation
  eval_outdir = os.path.join(args.outdir, 'eval')
  script_utils.mkdir_p(eval_outdir)
  add_job({
    'exe': 'gen_sim_eval.py',
    'args': ['--indir', args.outdir, '--outdir', eval_outdir],
    'out': os.path.join(eval_outdir, 'gen_sim_eval.out'),
    'err': os.path.join(eval_outdir, 'gen_sim_eval.err'),
    'env': 'prmf'
  }, parents=nmf_job_ids + prmf_job_ids + plier_job_ids)

  script_utils.run_digraph(args.outdir, job_graph, condor=args.condor, dry_run=args.dry_run)
Exemple #8
0
def main():
  """Build and run a linear pipeline that evaluates nmf_pathway.py on simulated gene lists.

  Four jobs are chained, each running after the previous one:
  ``simulate_screens.py`` -> ``diffusion.py`` -> ``nmf_pathway.py`` ->
  ``evaluate_screen_sim.py``. All jobs write their stdout/stderr logs into
  ``--outdir``. The graph is executed with ``script_utils.run_digraph`` with
  ``condor=False``.
  """
  parser = argparse.ArgumentParser(description=
"""
Evalute nmf_pathway.py by simulating gene lists
""")
  parser.add_argument("--rng-seed", help="Seed for random number generators", default=None)

  # simulation
  parser.add_argument("--n-gene-lists", help="Number of gene lists to simulate", type=int, default=6)
  parser.add_argument("--nodelist", help="Universe of node identifiers and an ordering on those identifiers", required=True)
  parser.add_argument("--seed-lists", required=True, nargs='+') # TODO future versions manifolds should be distinct from node lists derived from pathways
  parser.add_argument("--outdir", required=True)
  parser.add_argument("--simulator")

  # diffusion (also uses --nodelist)
  parser.add_argument("--network", required=True)

  # factorization (also uses --nodelist and --outdir; its --data input is the
  # diffusion output)
  parser.add_argument("--manifolds", required=True, nargs='+')

  args = parser.parse_args()

  job_graph = nx.DiGraph()

  def add_job(attrs, parents=()):
    """Add a job node with attributes `attrs` that runs after `parents`; return its id."""
    # Job ids are consecutive integers starting at 0, so the current node
    # count is always the next unused id.
    job_id = job_graph.number_of_nodes()
    # Keyword expansion is accepted by both the networkx 1.x signature
    # add_node(n, attr_dict=None, **attr) and the 2.x signature
    # add_node(n, **attr); a positional attribute dict is rejected by 2.x.
    job_graph.add_node(job_id, **attrs)
    for parent in parents:
      job_graph.add_edge(parent, job_id)
    return job_id

  # simulation
  sim_args = ["--seed-lists"] + args.seed_lists + ["--n-gene-lists", str(args.n_gene_lists), "--nodelist", args.nodelist, "--outdir", args.outdir]
  if args.simulator is not None:
    # --simulator is optional here; forwarding it unconditionally would put a
    # literal None into the argument list when the flag is omitted.
    sim_args += ['--simulator', args.simulator]
  if args.rng_seed is not None:
    sim_args += ['--rng-seed', args.rng_seed]
  sim_job_id = add_job({
    'exe': "simulate_screens.py",
    'args': sim_args,
    'out': os.path.join(args.outdir, "simulate_screens.out"),
    'err': os.path.join(args.outdir, "simulate_screens.err")
  })
  # Files consumed by later jobs; presumably written by simulate_screens.py
  # with 1-based list numbering -- confirm against that script.
  sim_list_fps = [
    os.path.join(args.outdir, "sim_list_{}.txt".format(i + 1))
    for i in range(args.n_gene_lists)
  ]
  chosen_seeds_fp = os.path.join(args.outdir, "chosen_seeds.txt")

  # diffusion
  diffused_fp = os.path.join(args.outdir, "diffused.csv")
  diffusion_job_id = add_job({
    'exe': "diffusion.py",
    'args': ["--network", args.network, "--nodelist", args.nodelist, "--gene-lists"] + sim_list_fps + ["--diffused", diffused_fp],
    'out': os.path.join(args.outdir, "diffusion.out"),
    'err': os.path.join(args.outdir, "diffusion.err")
  }, parents=[sim_job_id])

  # factorization
  nmf_args = ["--data", diffused_fp, "--manifolds"] + args.manifolds + ["--nodelist", args.nodelist, "--outdir", args.outdir]
  if args.rng_seed is not None:
    # nmf_pathway.py is given the seed under --seed rather than --rng-seed
    nmf_args += ['--seed', args.rng_seed]
  nmf_job_id = add_job({
    'exe': "nmf_pathway.py",
    'args': nmf_args,
    'out': os.path.join(args.outdir, "nmf_pathway.out"),
    'err': os.path.join(args.outdir, "nmf_pathway.err")
  }, parents=[diffusion_job_id])
  # presumably the gene-by-latent factor written by nmf_pathway.py -- confirm
  gene_by_latent_fp = os.path.join(args.outdir, "V.csv")

  # evaluation
  # TODO new version
  add_job({
    'exe': "evaluate_screen_sim.py",
    'args': ["--gene-by-latent", gene_by_latent_fp, "--nodelist", args.nodelist, "--true-seeds", chosen_seeds_fp],
    'out': os.path.join(args.outdir, "evaluate.out"),
    'err': os.path.join(args.outdir, "evaluate.err")
  }, parents=[nmf_job_id])

  # This pipeline has no --condor option; it is always run with condor=False.
  script_utils.run_digraph(args.outdir, job_graph, condor=False)