Ejemplo n.º 1
0
def main():
    """Command-line entry point: normalize expression data sets with BFRM.

    Reads the options and the input file names from sys.argv, then:
      1. reads the input files (read_matrices) and converts each matrix
         to GCT format,
      2. passes the matrices through log_matrices and writes them out,
      3. runs BFRM normalization (run_bfrm),
      4. writes the dataset / filtered-gene / heatmap / PCA / report
         summaries,
      5. with -z/--archive, zips the BFRM and ATTIC paths of the layout.

    Invalid arguments are reported with parser.error(), which prints the
    usage message to stderr and exits with status 2.  Checks that only
    need the command line run before the project libraries are imported.
    """
    from optparse import OptionParser

    usage = "usage: %prog [options] <file1> <file2> ..."
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("-f", "--num_factors", dest="num_factors",
                      type="int", default=15,
                      help="Number of factors to use for normalization.")
    # Any string in the control probe file can be a control probe.
    # Delimited by tabs and newlines.
    parser.add_option("--control_probe_file", dest="control_probe_file",
                      default=None,
                      help="File that contains the control probes.")
    parser.add_option("--python", dest="python", default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("--bfrm", dest="bfrm_path", default=None,
                      help="Specify the path to the BFRM_normalize directory.")
    parser.add_option("--matlab", dest="matlab", default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("--arrayplot", dest="arrayplot", default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("--povray", dest="povray", default="povray",
                      help="Specify the command to run povray.")
    parser.add_option("--cluster", dest="cluster", default=None,
                      help="Specify the command to run cluster.")
    parser.add_option("--libpath", dest="libpath", action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o", "--outpath", dest="outpath", type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z", "--archive", dest="archive",
                      action="store_true", default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    # Fail fast on anything that can be checked from the command line
    # alone, before importing the project libraries.
    if not args:
        parser.error("Please specify files to normalize.")

    # Check to make sure value for num_factors is reasonable.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors < MIN_FACTORS:
        noun = "factor is" if MIN_FACTORS == 1 else "factors are"
        parser.error("At least %d %s required." % (MIN_FACTORS, noun))
    elif options.num_factors > MAX_FACTORS:
        parser.error("%d factors is too many.  Maximum is %d." %
                     (options.num_factors, MAX_FACTORS))

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import this after the library path is set.
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()

    genepattern.fix_environ_path()

    filenames = args
    names = [os.path.split(x)[-1] for x in filenames]
    for filename in filenames:
        # Reported as a usage error rather than an assert, so the check
        # is not stripped when Python runs with -O.
        if not filelib.exists(filename):
            parser.error("File not found: %s" % filename)

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read each of the input files and align them.
    matrices = read_matrices(filenames)

    # Make sure the number of factors don't exceed the size of the
    # matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Standardize each of the matrices to GCT format.
    matrices = [arrayio.convert(m, to_format=arrayio.gct_format)
                for m in matrices]
    write_dataset(file_layout.DS_ORIG, matrices)

    # Log each of the matrices if needed.
    log_matrices(names, matrices)
    write_dataset(file_layout.DS_PROC, matrices)
    sys.stdout.flush()

    # Format the parameters and output files for bfrm.
    run_bfrm(options.bfrm_path, options.num_factors,
             options.control_probe_file, file_layout, options.matlab)

    # Generate some files for output.
    summarize_dataset(file_layout)
    summarize_filtered_genes(file_layout)
    summarize_heatmaps(options.python, options.arrayplot, options.cluster,
                       file_layout, options.libpath)
    summarize_pca(options.povray, file_layout, matrices)
    summarize_report(filenames, matrices, options.num_factors, start_time,
                     file_layout)

    # Archive the BFRM stuff, and the big files.
    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.BFRM, noclobber=False)
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print("Done.")
Ejemplo n.º 2
0
def main():
    """Command-line entry point: score gene sets on expression data sets.

    Parses the arguments from sys.argv, reads the gene set files, builds
    one job per (gene set or gene, expression file) pair, scores the jobs
    with score_many (in-process for -j 1, otherwise in a multiprocessing
    pool), and writes a tab-delimited table to the -o file.

    Output columns are, in order: SAMPLE, FILE, then for every gene set
    its score, direction, pvalue and "significant" columns (grouped by
    column type), then one column per requested gene.  With --transpose
    the table is transposed before it is written.

    Invalid arguments are reported with parser.error() (usage message on
    stderr, exit status 2).  Inconsistent gene set data (duplicate or
    unknown gene set names) raises AssertionError.
    """
    import argparse
    import glob
    import itertools

    DEF_PVALUE = 0.05

    parser = argparse.ArgumentParser(
        description="Score a gene set on a gene expression data set.")

    parser.add_argument("expression_files",
                        nargs="+",
                        help="Data set(s) to score.")
    parser.add_argument("-o",
                        dest="outfile",
                        default=None,
                        help="Name of file for results.")
    parser.add_argument("--transpose",
                        action="store_true",
                        help="Transpose the output matrix.")
    parser.add_argument(
        "--pvalue",
        type=float,
        default=DEF_PVALUE,
        help="p-value cutoff for determining significant changes "
        "(default %g)." % DEF_PVALUE)

    parser.add_argument("--libpath",
                        dest="libpath",
                        action="append",
                        default=[],
                        help="Add to the Python library search path.")
    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")

    # Assumes that there are no commas in names of gene sets.
    group = parser.add_argument_group(title="Gene Set")
    group.add_argument(
        "--geneset_file",
        dest="geneset_files",
        action="append",
        default=[],
        help="File(s) with gene sets.  Should be in gmx or gmt format.")
    group.add_argument(
        "-g",
        dest="gene_set",
        action="append",
        default=[],
        help="Name of the gene set to score.  If you want to score both "
        "the positively and negatively correlated genes, specify both "
        "gene sets using the format: <positive_geneset>,<negative_geneset>.  "
        "You can use this option multiple times to score more than one gene "
        "set.")
    group.add_argument("--all",
                       dest="all_gene_sets",
                       action="store_true",
                       default=False,
                       help="Score all gene sets in the files.")
    group.add_argument(
        "--any_matching",
        dest="any_matching_gene_sets",
        action="store_true",
        default=False,
        help="Score gene sets in the files that matches these genes.")
    group.add_argument("--automatch",
                       action="store_true",
                       default=False,
                       help="Will match _UP with _DN (or _DOWN).")

    group = parser.add_argument_group(
        title="Genes", description="Add gene expression profiles to output.")
    group.add_argument(
        "--genes",
        default=[],
        action="append",
        help="Comma-separated list of IDs (e.g. probes, gene names) "
        "to include.")

    args = parser.parse_args()

    # Validate with parser.error() rather than assert, so the checks are
    # not stripped under -O and the user gets a usage message.
    # (nargs="+" already guarantees at least one expression file.)
    expression_files = []
    for pattern in args.expression_files:
        matches = glob.glob(pattern)
        if not matches:
            parser.error(
                "I could not find the expression file: %s" % pattern)
        expression_files.extend(matches)
    for filename in expression_files:
        # glob can return dangling symlinks; make sure the target exists.
        if not os.path.exists(filename):
            parser.error(
                "I could not find the expression file: %s" % filename)
    if not args.outfile:
        parser.error("Please specify the name of an outfile.")

    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if not (0 < args.pvalue <= 1):
        parser.error("Invalid pvalue %g" % args.pvalue)

    if not args.geneset_files:
        parser.error("Please specify one or more geneset files.")
    for filename in args.geneset_files:
        if not os.path.exists(filename):
            parser.error("I could not find the gene set file: %s" % filename)
    if not (args.all_gene_sets or args.gene_set or
            args.any_matching_gene_sets):
        parser.error("Please specify one or more gene sets to score.")
    # -g, --all and --any_matching are mutually exclusive.
    if args.all_gene_sets and (args.gene_set or args.any_matching_gene_sets):
        parser.error("--all cannot be combined with -g or --any_matching.")
    if args.any_matching_gene_sets and args.gene_set:
        parser.error("--any_matching cannot be combined with -g.")

    if args.libpath:
        sys.path = args.libpath + sys.path
    # Import after the library path is set.
    import multiprocessing
    from genomicode import genesetlib
    from genomicode import genepattern
    from genomicode import jmath

    genepattern.fix_environ_path()

    gene_names = _parse_gene_names(args.genes)

    if len(args.geneset_files) > 1:
        print("Reading gene set files.")
    else:
        print("Reading gene set file.")
    sys.stdout.flush()
    geneset2genes = {}  # name -> list of genes
    for filename in args.geneset_files:
        for name, description, genes in genesetlib.read_genesets(filename):
            if name in geneset2genes:
                raise AssertionError("Duplicate geneset: %s." % name)
            geneset2genes[name] = genes

    genesets = args.gene_set
    if args.all_gene_sets or args.any_matching_gene_sets:
        genesets = sorted(geneset2genes)
    if args.automatch:
        genesets = match_gene_sets(genesets)

    matrix_names = [os.path.split(x)[1] for x in expression_files]

    print("Setting up jobs.")
    sys.stdout.flush()
    ignore_gene_not_found = args.any_matching_gene_sets
    # Each job is a 6-tuple:
    #   gs_name, pos_genes, neg_genes, matrix_name, matrix_file, ignore flag
    #   gene_name, None, None, matrix_name, matrix_file, None
    jobs = []
    for geneset in genesets:
        pos_gs, neg_gs = _parse_geneset(geneset)
        if pos_gs not in geneset2genes:
            raise AssertionError("I could not find gene set: %s" % pos_gs)
        if neg_gs and neg_gs not in geneset2genes:
            raise AssertionError("I could not find gene set: %s" % neg_gs)
        gs_name = pos_gs
        if neg_gs:
            gs_name = "%s/%s" % (pos_gs, neg_gs)

        pos_genes = geneset2genes[pos_gs]
        neg_genes = geneset2genes.get(neg_gs, [])

        if not pos_genes and not neg_genes:
            print("Empty gene set: %s.  Skipping." % gs_name)
            continue

        for matrix_name, matrix_file in zip(matrix_names, expression_files):
            jobs.append((gs_name, pos_genes, neg_genes, matrix_name,
                         matrix_file, ignore_gene_not_found))
    for name in gene_names:
        for matrix_name, matrix_file in zip(matrix_names, expression_files):
            jobs.append((name, None, None, matrix_name, matrix_file, None))

    # Group the jobs into batches such that jobs that use the same
    # matrix are in the same batch.
    jobs_by_file = {}  # matrix_file -> list of jobs
    for job in jobs:
        jobs_by_file.setdefault(job[4], []).append(job)
    # Materialize as a list: the batches are indexed and appended to below.
    batched_jobs = list(jobs_by_file.values())  # list of list of jobs

    # If there are too many gene sets to score for a file, split it up
    # into multiple batches.  Don't know the tradeoff between reading
    # a file twice and calculating more gene sets.
    while len(batched_jobs) < args.num_procs:
        # Find the first largest batch that can still be split (> 1 job).
        i_largest = None
        largest = 1
        for i, batch in enumerate(batched_jobs):
            if len(batch) > largest:
                largest = len(batch)
                i_largest = i
        if i_largest is None:
            break
        # Split it in half.
        to_split = batched_jobs[i_largest]
        mid = len(to_split) // 2
        batched_jobs[i_largest] = to_split[:mid]
        batched_jobs.append(to_split[mid:])

    job_str = "jobs"
    if len(jobs) == 1:
        job_str = "job"
    print("Scoring %d %s." % (len(jobs), job_str))
    sys.stdout.flush()

    # (matrix, geneset, index, sample) -> GeneSetScore or GeneScore
    score_dict = {}
    if args.num_procs == 1:
        # Serial path: no worker processes or shared lock are needed.
        for batch in batched_jobs:
            score_dict.update(score_many(batch))
    else:
        manager = multiprocessing.Manager()
        lock = manager.Lock()
        pool = multiprocessing.Pool(args.num_procs)
        results = [  # AsyncResults
            pool.apply_async(score_many, (batch, ), {"lock": lock})
            for batch in batched_jobs]
        pool.close()
        pool.join()
        for result in results:
            score_dict.update(result.get())

    all_matrix_samples = set()
    all_genesets = set()
    all_genes = set()
    for key, score in score_dict.items():
        matrix_name, gene_name, index, sample = key
        all_matrix_samples.add((matrix_name, index, sample))
        if isinstance(score, GeneSetScore):
            all_genesets.add(gene_name)
        elif isinstance(score, GeneScore):
            all_genes.add(gene_name)
        else:
            raise AssertionError("Unexpected score type: %r" % (score, ))
    all_matrix_samples = sorted(all_matrix_samples)
    all_genesets = sorted(all_genesets)
    all_genes = sorted(all_genes)

    # Format the output.  Columns should be in order:
    # <SAMPLE> <FILE>
    # <GS SCORES> ... <GS DIRECTION> ... <GS PVALUE> ... <GS SIGNIFICANT> ...
    # <GENES> ...
    header = ["SAMPLE", "FILE"]
    suffixes = ["", "direction", "pvalue", "significant"]
    for suffix, name in itertools.product(suffixes, all_genesets):
        header.append(("%s %s" % (name, suffix)).strip())
    header.extend(all_genes)

    # Placeholders used when a (matrix, name, index, sample) key has no
    # score.
    default_geneset_score = GeneSetScore("", "", "", "")
    default_gene_score = GeneScore("")

    output = [header]
    for matrix, index, sample in all_matrix_samples:
        # Get the scores for the gene sets.
        gs_results = [
            score_dict.get((matrix, gs, index, sample),
                           default_geneset_score)
            for gs in all_genesets]
        scores = [r.score for r in gs_results]
        directs = [r.direction for r in gs_results]
        pvalues = [r.pvalue for r in gs_results]
        signifs = []
        for direct, pvalue in zip(directs, pvalues):
            # Exact type check kept from the original: non-float pvalues
            # (e.g. the "" placeholder) are never marked significant.
            # NOTE(review): float subclasses are skipped too -- confirm
            # that is intended before switching to isinstance().
            if type(pvalue) is float and pvalue < args.pvalue:
                signifs.append(direct)
            else:
                signifs.append("")

        # Get the scores for the genes.
        gene_scores = [
            score_dict.get((matrix, g, index, sample),
                           default_gene_score).score
            for g in all_genes]

        row = [sample, matrix] + \
            scores + directs + pvalues + signifs + gene_scores
        assert len(row) == len(header)
        output.append(row)

    if args.transpose:
        output = jmath.transpose(output)

    # The with-block closes the file even if formatting a row fails.
    with open(args.outfile, 'w') as outhandle:
        for row in output:
            outhandle.write("\t".join(map(str, row)) + "\n")

    print("Done.")
Ejemplo n.º 3
0
def main():
    """Command-line entry point: project a data set onto a BFRM model.

    Expects exactly two positional arguments, <bfrm_model> and <dataset>.
    Reads the data set, converts it to GCT format, passes it through
    log_matrix, writes the data set and the model into the output layout,
    runs the BFRM projection (run_bfrm_project) and summarizes the factor
    scores.  With -z/--archive, the ATTIC and BFRM paths of the layout
    are zipped afterwards.

    Invalid arguments (wrong number of files, missing files) are reported
    with parser.error(), which prints the usage message to stderr and
    exits with status 2.  These checks run before the project libraries
    are imported.
    """
    from optparse import OptionParser

    usage = "usage: %prog [options] <bfrm_model> <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("--bfrm_path", dest="bfrm_path", default=None,
                      help="Specify the path to BFRM_project.")
    parser.add_option("--matlab", dest="matlab", default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("--python", dest="python", default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("--arrayplot", dest="arrayplot", default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("--cluster", dest="cluster", default=None,
                      help="Specify the command to run cluster.")
    parser.add_option("--libpath", dest="libpath", action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o", "--outpath", dest="outpath", type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z", "--archive", dest="archive",
                      action="store_true", default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    # Validate the positional arguments up front.  parser.error() is used
    # instead of assert so the checks are not stripped under -O.
    if len(args) != 2:
        parser.error("Please specify files.")
    model_file, filename = args
    for path in (model_file, filename):
        if not os.path.exists(path):
            parser.error("File not found: %s" % path)

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import this after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    raw_matrix = arrayio.read(filename)
    MATRIX = arrayio.convert(raw_matrix, to_format=arrayio.gct_format)
    print("Read data set with %d genes and %d samples." % (
        MATRIX.nrow(), MATRIX.ncol()))

    log_matrix(MATRIX)

    # Write out the data sets.
    write_dataset(file_layout.DATASET, MATRIX)

    # Save the BFRM model.
    write_model(model_file, file_layout)

    # Run BFRM projection.
    run_bfrm_project(file_layout, options.bfrm_path, options.matlab)

    # Generate output files.
    summarize_factor_scores(file_layout, options.python, options.arrayplot,
                            options.cluster, options.libpath)

    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        archive.zip_path(file_layout.BFRM, noclobber=False)

    print("Done.")
Ejemplo n.º 4
0
def main():
    from optparse import OptionParser, OptionGroup
    
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-r", "--rma", dest="rma_dataset", type="string", default=None,
        help="Specify the RMA-normalized data to analyze.")
    parser.add_option(
        "-m", "--mas5", dest="mas5_dataset", type="string", default=None,
        help="Specify the MAS5-normalized data to analyze.")
    parser.add_option(
        "-i", "--illu", dest="illu_dataset", type="string", default=None,
        help="Specify the Illumina data to analyze.")
    parser.add_option(
        "", "--sigdb_path", dest="sigdb_path", type="string", default=None,
        help="Location of the sigdb/ directory.")
    parser.add_option(
        "", "--sigtag", dest="signature_tags", default=[], action="append",
        help="Specify a specific tag to use.")
    parser.add_option(
        "", "--sigid", dest="signature_ids", default=[], action="append",
        help="Specify a specific signature to use.")
    parser.add_option(
        "", "--max_signatures", dest="max_signatures", type="int",
        default=None,
        help="Maximum number of signatures to run (for DEBUGGING).")
    parser.add_option(
        "-j", "", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")
    parser.add_option(
        "-z", "", dest="archive", action="store_true", default=False,
        help="Archive the individual signatures.  Helpful for GenePattern.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "", "--gp_imod_all_vars", dest="gp_imod_all_vars", type="string",
        default=None,
        help="Special internal variable for use with GenePattern "
        "interactive modules.")
    parser.add_option(
        "", "--debug_gp_imod_all_vars", action="store_true", default=False, 
        dest="debug_gp_imod_all_vars",
        )
    
    #group = OptionGroup(parser, "Normalization")
    #group.add_option(
    #    "", "--normalization", dest="normalization", default="MAS5",
    #    help="How was the data set normalized (default MAS5).")
    #group.add_option(
    #    "-l", "--log_data", dest="log_data", action="store_true",
    #    default=False,
    #    help="Log the MAS5 data before analyzing.")
    #parser.add_option_group(group)

    group = OptionGroup(parser, "Pybinreg")
    group.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python.")
    group.add_option(
        "", "--matlab", dest="matlab", default=None,
        help="Specify the command to run matlab.")
    group.add_option(
        "", "--povray", dest="povray", default=None,
        help="Specify the command to run povray.")
    group.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    group.add_option(
        "", "--binreg", dest="binreg_path", default=None,
        help="Specify the path to the BinReg2.0 code.")
    group.add_option(
        "", "--pybinreg", dest="pybinreg", default=None,
        help="Specify the command to run pybinreg.py.")
    group.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option_group(group)

    options, args = parser.parse_args()
    #if len(args) < 1:
    #    #print sys.argv
    #    #print len(args), args
    #    parser.error("Please specify sigdb_path.")
    #elif len(args) > 1:
    #    parser.error("Too many arguments.")
    if args:
        parser.error("Too many arguments.")

    # DEBUG the gp_imod_all_vars variable.
    if options.debug_gp_imod_all_vars:
        assert not options.gp_imod_all_vars
        options.gp_imod_all_vars = (
            "mas5_expression_file_cb=file&mas5_expression_file_url=&"
            "rma_expression_file_cb=file&rma_expression_file_url=&"
            # Skip AKT signature.
            "sig_AKT=no&"
            # Change BCAT normalization.
            "sig_BCAT=yes (custom parameters)&"
            "sig_BCAT_apply_quantile_normalization=no&"
            "sig_BCAT_apply_shiftscale_normalization=no&"
            "sig_BCAT_num_genes=85&sig_BCAT_num_metagenes=2&"
            # No changes in E2F1.
            "sig_E2F1=yes (custom parameters)&"
            "sig_E2F1_apply_quantile_normalization=yes&"
            "sig_E2F1_apply_shiftscale_normalization=yes&"
            "sig_E2F1_num_genes=150&sig_E2F1_num_metagenes=2&"
            # Change genes in EGFR.
            "sig_EGFR=yes (custom parameters)&"
            "sig_EGFR_apply_quantile_normalization=no&"
            "sig_EGFR_apply_shiftscale_normalization=yes&"
            #"sig_EGFR_num_genes=50000&sig_EGFR_num_metagenes=2&"
            "sig_EGFR_num_genes=501&sig_EGFR_num_metagenes=2&"
            # Change quantile, genes, metagenes in ER.
            "sig_ER=yes (custom parameters)&"
            "sig_ER_apply_quantile_normalization=no&"
            "sig_ER_apply_shiftscale_normalization=yes&"
            "sig_ER_num_genes=150&sig_ER_num_metagenes=3&"
            "sig_HER2=yes (default parameters)&"
            "sig_IFNalpha=yes (default parameters)&"
            "sig_IFNgamma=yes (default parameters)&"
            "sig_MYC=yes (default parameters)&"
            "sig_P53=yes (default parameters)&"
            "sig_P63=yes (default parameters)&"
            "sig_PI3K=yes (default parameters)&"
            "sig_PR=yes (default parameters)&"
            "sig_RAS=yes (default parameters)&"
            "sig_SRC=yes (default parameters)&"
            "sig_STAT3=yes (default parameters)&"
            "sig_TGFB=yes (default parameters)&"
            "sig_TNFa=yes (default parameters)&"
            "which_signatures=I choose myself"
            )
        
    datafile_rma = datafile_mas5 = datafile_illu = None
    if options.rma_dataset is not None:
        assert os.path.exists(options.rma_dataset), \
               "RMA file not found: %s" % options.rma_dataset
        datafile_rma = os.path.realpath(options.rma_dataset)
    if options.mas5_dataset is not None:
        assert os.path.exists(options.mas5_dataset), \
               "MAS5 file not found: %s" % options.mas5_dataset
        datafile_mas5 = os.path.realpath(options.mas5_dataset)
    if options.illu_dataset is not None:
        assert os.path.exists(options.illu_dataset), \
               "ILLU file not found: %s" % options.illu_dataset
        datafile_illu = os.path.realpath(options.illu_dataset)
    assert datafile_rma or datafile_mas5 or datafile_illu, \
           "Please specify at least one data set."

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import time
    import arrayio
    from genomicode import config
    from genomicode import parallel
    from genomicode import archive
    from genomicode import hashlib
    from genomicode import matrixlib
    from genomicode import genepattern
    
    #sigdb_path, = args
    x = options.sigdb_path or config.sigdb_path
    sigdb_path = os.path.realpath(x)
    assert os.path.exists(sigdb_path), \
           "I could not find the signatures database: %s." % sigdb_path

    start_time = time.time()
    
    genepattern.fix_environ_path()
    
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the signatures and select the ones to score.
    # BUG: Should allow this to be specified on the command line.
    desired_tags = ["Pathway"]  # default
    if options.signature_tags:
        desired_tags = options.signature_tags[:]
    all_normalization = ["RMA", "MAS5", "ILLU"]
    desired_normalization = []
    if datafile_rma is not None:   # RMA datafile is specified.
        desired_normalization.append("RMA")
    if datafile_mas5 is not None:  # MAS5 datafile is specified.
        desired_normalization.append("MAS5")
    if datafile_illu is not None:  # ILLU datafile is specified.
        desired_normalization.append("ILLU")
        
    # If any signature IDs are specified, then use only those IDs and
    # ignore the desired tags.
    print "Reading signature database: %s." % sigdb_path
    desired_ids = []
    if options.signature_ids:
        desired_ids = options.signature_ids[:]
    x = read_signatures(
        sigdb_path, all_normalization, desired_ids, desired_tags)
    signatures = x
    orig_signatures = signatures[:]

    # Filter for just the normalization that we have data files for.
    # Keep track of why we filtered out certain signatures.
    why_dropped = {}  # ID -> explanation as string
    good = []
    for sig in signatures:
        if sig.Normalization.upper() in desired_normalization:
            good.append(sig)
            continue
        x = "Signature requires %s normalized data, but it was not provided."%(
            sig.Normalization.upper())
        why_dropped[sig.xID] = x
    signatures = good
    assert signatures, "No signatures available."

    # Process additional parameters from GenePattern.
    # o Do this before max_signatures, so that the maximum signatures
    #   is selected only out of the ones that the user specified.
    # o Do this before names and paths, so the variables will be
    #   aligned.
    # gp_imod_all_vars can be None or "".
    if options.gp_imod_all_vars:
        x = process_gp_imod_all_vars(
            options.gp_imod_all_vars, signatures, why_dropped)
        signatures, why_dropped = x

    sys.stdout.flush()
    DATA_rma = DATA_mas5 = DATA_illu = None
    if datafile_rma is not None:
        print "Reading RMA file: %s" % datafile_rma
        DATA_rma = arrayio.read(datafile_rma)
        DATA_rma = arrayio.convert(DATA_rma, to_format=arrayio.gct_format)
    if datafile_mas5 is not None:
        print "Reading MAS5 file: %s" % datafile_mas5
        DATA_mas5 = arrayio.read(datafile_mas5)
        DATA_mas5 = arrayio.convert(DATA_mas5, to_format=arrayio.gct_format)
    if datafile_illu is not None:
        print "Reading ILLU file: %s" % datafile_illu
        DATA_illu = arrayio.read(datafile_illu)
        DATA_illu = arrayio.convert(DATA_illu, to_format=arrayio.gct_format)
    # Don't handle the log.  Let pybinreg do it.
    # Make sure the data sets contain the same samples.  Align them if
    # necessary.
    DATA_all = [
        ("DATA_rma", DATA_rma), ("DATA_mas5", DATA_mas5),
        ("DATA_illu", DATA_illu)]
    DATA_all = [x for x in DATA_all if x[1]]
    for i in range(1, len(DATA_all)):
        key1, data1 = DATA_all[0]
        key2, data2 = DATA_all[i]
        assert key1 != key2
        assert data1 and data2
        assert data1.ncol() == data2.ncol(), \
               "%s and %s data sets have different numbers of samples." % (
            key1, key2)
        if matrixlib.are_cols_aligned(data1, data2):
            continue
        x = matrixlib.align_cols(data1, data2)
        data1_new, data2_new = x
        assert matrixlib.are_cols_aligned(data1_new, data2_new)
        # The samples in data1 (the reference) should not be changed.
        assert data1.ncol() == data1_new.ncol(), \
               "%s and %s data sets have different samples" % (
            key1, key2)
        assert matrixlib.are_cols_aligned(data1, data1_new)
        DATA_all[i] = key2, data2_new
    for key, data in DATA_all:
        if key == "DATA_rma":
            DATA_rma = data
        elif key == "DATA_mas5":
            DATA_mas5 = data
        elif key == "DATA_illu":
            DATA_illu = data
        else:
            raise AssertionError, "Unknown key: %s" % key
    print "Writing aligned signal files."
    if DATA_rma:
        arrayio.gct_format.write(
            DATA_rma, open(file_layout.DATASET_RMA, 'w'))
    if DATA_mas5:
        arrayio.gct_format.write(
            DATA_mas5, open(file_layout.DATASET_MAS5, 'w'))
    if DATA_illu:
        arrayio.gct_format.write(
            DATA_illu, open(file_layout.DATASET_ILLU, 'w'))

    # Figure out the names and paths for each signature.
    print "Finding signatures."
    names = [None] * len(signatures)   # SIG19_AKT[_modified]
    paths = [None] * len(signatures)   # <path>/SIG19_AKT[_modified]
    for i, sig in enumerate(signatures):
        name = "SIG%02d_%s" % (sig.xID, hashlib.hash_var(sig.Name))
        # If the user has modified the signature from the default
        # parameters, then make a note of it.
        if getattr(sig, "Changed", False):
            name = "%s_modified" % name
        outpath = os.path.join(file_layout.OUTPATH, name)
        names[i] = name
        paths[i] = outpath

    if options.max_signatures is not None:
        signatures = signatures[:options.max_signatures]

    # Make a list of the jobs.
    jobs = []  # list of cmd, outpath, outfile
    for i, sig in enumerate(signatures):
        name, outpath = names[i], paths[i]
        #print "Generating signature %s [%d:%d]" % (
        #    name, i+1, len(signatures))
        #sys.stdout.flush()
        
        quantile_normalize = False
        assert sig.Quantile.upper() in ["YES", "NO"]
        if sig.Quantile.upper() == "YES":
            quantile_normalize = True
        shift_scale_normalize = False
        assert sig.Shift_Scale.upper() in ["YES", "NO"]
        if sig.Shift_Scale.upper() == "YES":
            shift_scale_normalize = True
        
        #outfile = os.path.join(files.outpath, "%s.out.txt" % name)
        outfile = os.path.join(outpath, "out.txt")

        if sig.Normalization.upper() == "RMA":
            datafile = file_layout.DATASET_RMA
            assert DATA_rma
        elif sig.Normalization.upper() == "MAS5":
            datafile = file_layout.DATASET_MAS5
            assert DATA_mas5
        elif sig.Normalization.upper() == "ILLU":
            datafile = file_layout.DATASET_ILLU
            assert DATA_illu
        else:
            raise AssertionError, "Unknown normalization."

        # If the entire analysis should be archived, then go ahead and
        # archive each of the pybinreg runs too.  This will prevent
        # large analyses from taking up too much disk space.  The
        # drawback is that the files that are archived are no longer
        # available for use here.  Hopefully this won't be a problem.
        cmd = make_pybinreg_cmd(
            options.pybinreg, options.python, options.binreg_path,
            options.matlab, options.arrayplot, options.povray,
            options.cluster, options.libpath,
            outpath, options.archive, sig.Genes, sig.Metagenes,
            quantile_normalize, shift_scale_normalize,
            sig.Train0, sig.Train1, datafile)
        x = cmd, outpath, outfile
        jobs.append(x)

    # Run each of the jobs.
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if options.num_procs > 1:
        if parallel._find_parallel():
            num_sigs = min(options.num_procs, len(jobs))
            if num_sigs > 1:
                print "Predicting %d signatures at a time." % num_sigs
        else:
            print("I could not find GNU parallel.  "
                  "Predicting 1 signature at a time.")
            options.num_procs = 1
        sys.stdout.flush()

    DEBUG = False   # Can disable pybinreg temporarily for debugging.
    if not DEBUG:  
        if options.num_procs <= 1:
            for x in jobs:
                cmd, outpath, outfile = x
                run_one_pybinreg(cmd, outpath, outfile)
        else:
            run_many_pybinreg(jobs, options.num_procs)

    if signatures:
        print "Extracting the reports from each signature."
        report_files = extract_reports(names, paths, file_layout)
        
        print "Combining probabilities from each of the signatures."
        summarize_probabilities(signatures, names, paths, file_layout)

        print "Making heatmap of the results."
        sys.stdout.flush()
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            options.libpath, file_layout)

        print "Summarizing signatures."
        summarize_signatures(signatures, file_layout)

        print "Making a report."
        analysis_name = make_analysis_name(options)
        summarize_report(
            analysis_name, signatures, orig_signatures, report_files,
            start_time, why_dropped, file_layout)

    if options.archive:
        print "Compressing results."
        sys.stdout.flush()
        archive.zip_path(file_layout.ATTIC)
        for i, sig in enumerate(signatures):
            name, outpath = names[i], paths[i]
            archive.zip_path(outpath)
    
    print "Done."
Ejemplo n.º 5
0
def main():
    """Command-line entry point: factor one expression data set with BFRM.

    Steps, in order:
      1. Parse and validate the command line.
      2. Read the data set and convert it to GCT format.
      3. Log (if necessary) and filter a copy of the matrix, then write
         both the original and the processed data sets.
      4. Run BFRM and generate the summary files.
      5. Archive the BFRM model directory (always) and the attic
         directory (only with --archive).

    Invalid command lines are reported with parser.error(), which prints
    the usage message and exits with status 2.
    """
    from optparse import OptionParser, OptionGroup

    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--bfrm_bin",
                      dest="bfrm_bin",
                      default=None,
                      help="Specify the path to the BFRM binary.")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Filtering")
    group.add_option(
        "--filter_mean",
        dest="filter_mean",
        type=float,
        default=None,
        help="Remove this portion of genes based on mean expression.")
    group.add_option("--filter_var",
                     dest="filter_var",
                     type=float,
                     default=None,
                     help="Remove this portion of genes based on variance.")
    group.add_option("--cutoff",
                     dest="cutoff",
                     type=float,
                     default=0.99,
                     help="Cutoff probability for a gene to be in a factor.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "BFRM Parameters")
    group.add_option("--nc",
                     dest="num_control_vars",
                     type="int",
                     default=None,
                     help="Specify the number of control variables to use.")
    group.add_option(
        "--num_factors",
        dest="num_factors",
        type="int",
        default=None,
        help="The number of factors to fit.  "
        "For evolutionary search, starts with this number of factors.")
    group.add_option(
        "--design_file",
        dest="design_file",
        default=None,
        help="A file containing a matrix with additional design variables.")
    group.add_option(
        "--nucleus_file",
        dest="nucleus_file",
        default=None,
        help="A file that contains the genes to start the evolution.  "
        "This should be a text file that contains a whitespace-separated "
        "list of genes.  If this or --nucleus_geneset is given, "
        "the evolutionary search will be turned on.")
    group.add_option(
        "--nucleus_geneset",
        dest="nucleus_geneset",
        default=None,
        help="A gene set that contains the genes to start the evolution.  "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    # NOTE(review): no type is declared for the two evol_* options, so
    # their values reach run_bfrm as strings.  Confirm that is intended.
    group.add_option("--evol_max_factors",
                     dest="evol_max_factors",
                     default=None,
                     help="Maximum number of factors for the evolution.")
    group.add_option("--evol_max_genes",
                     dest="evol_max_genes",
                     default=None,
                     help="Maximum number of genes for the evolution.")
    parser.add_option_group(group)

    # Parse the arguments.
    options, args = parser.parse_args()

    # Check everything that depends only on the command line first, so
    # that a bad invocation is reported as a usage error before any of
    # the project libraries are imported.
    if options.cutoff <= 0 or options.cutoff > 1:
        parser.error("Cutoff probability should be between 0 and 1.")
    if options.filter_mean and (options.filter_mean < 0
                                or options.filter_mean >= 1):
        parser.error("filter_mean filter should be between 0 and 1.")
    if options.filter_var and (options.filter_var < 0
                               or options.filter_var >= 1):
        parser.error("filter_var filter should be between 0 and 1.")

    if len(args) != 1:
        parser.error("Please specify a file to factor.")
    filename, = args
    # Use parser.error rather than assert: asserts are stripped under -O
    # and produce a traceback instead of a usage message.
    if not os.path.exists(filename):
        parser.error("File not found: %s" % filename)

    if options.nucleus_file and options.nucleus_geneset:
        parser.error("Please specify either nucleus_file or nucleus_geneset.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import this after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    nucleus = None
    if options.nucleus_file:
        nucleus = _read_nucleus_file(options.nucleus_file)
    elif options.nucleus_geneset:
        nucleus = _read_nucleus_geneset(options.nucleus_geneset)

    # Not sure if this is necessary.  Don't know if BFRM will provide
    # a default if not given.
    if nucleus and not options.num_factors:
        parser.error("Please specify number of factors.")

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    x = arrayio.read(filename)
    MATRIX_orig = arrayio.convert(x, to_format=arrayio.gct_format)
    print("Read data set with %d genes and %d samples." % (
        MATRIX_orig.nrow(), MATRIX_orig.ncol()))

    # Make a copy so that in-place changes (like log_matrix) won't
    # affect the original matrix.
    MATRIX = MATRIX_orig.matrix()

    # Log the data set if necessary.
    log_matrix(MATRIX)

    # Filter out genes based on mean and variance.
    MATRIX = filter_dataset(MATRIX, options.filter_mean, options.filter_var)
    if MATRIX.nrow() != MATRIX_orig.nrow():
        print("Filtered from %d genes to %d." % (
            MATRIX_orig.nrow(), MATRIX.nrow()))

    # Write out the data sets.
    write_dataset(file_layout.DATASET_ORIG, MATRIX_orig)
    write_dataset(file_layout.DATASET, MATRIX)

    # Run BFRM.  Set DEBUG to True to skip the run while debugging the
    # summary steps below.
    DEBUG = False
    if not DEBUG:
        run_bfrm(file_layout, options.bfrm_bin, options.num_control_vars,
                 options.num_factors, options.design_file, nucleus,
                 options.evol_max_factors, options.evol_max_genes)

    # Generate output files.
    summarize_factor_scores(file_layout, options.cutoff, options.python,
                            options.arrayplot, options.cluster,
                            options.libpath)
    summarize_gene_factor_probs(file_layout, options.cutoff, options.python,
                                options.arrayplot, options.cluster,
                                options.libpath)
    summarize_factor_geneset(file_layout, options.cutoff)

    # BFRM model file should always be archived.
    archive.zip_path(file_layout.BFRM, noclobber=False)

    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print("Done.")
Ejemplo n.º 6
0
def main():
    """Command-line entry point: build/apply SELAP subtype models.

    Steps, in order:
      1. Parse and validate the command line.  --penalty may describe
         several integer penalties; each one yields its own analysis.
      2. Read the data set, convert it to GCT format, align it to a
         pre-built model if --model was given, and write it out.
      3. For every penalty: make (or copy) the model, predict the
         subgroups, summarize the predictions, and optionally archive.
      4. When more than one penalty was requested, copy the per-analysis
         prediction files to their global locations and summarize the
         subgroups across all analyses.

    Invalid command lines are reported with parser.error(), which prints
    the usage message and exits with status 2.
    """
    from optparse import OptionParser, OptionGroup
    import shutil

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--selap",
                      dest="selap_path",
                      default=None,
                      help="Specify the path to SELAPv3.")
    parser.add_option("",
                      "--matlab",
                      dest="matlab",
                      default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    # This doesn't give as much control over exactly which python
    # version is run.
    #parser.add_option(
    #    "", "--binpath", dest="binpath", action="append", default=[],
    #    help="Add to the binary search path.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher numbers have more groups.
    # Range from 0 and lower.
    group.add_option(
        "-p",
        "--penalty",
        dest="penalty",
        default="-33",
        help="Penalty for tuning number of subgroups (default -33).")
    group.add_option(
        "-m",
        "--model",
        dest="model_file",
        default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)

    if "." in options.penalty:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))

    # These are problems with user input, so report them as usage errors
    # rather than with assert (which is stripped under -O).
    if len(penalties) > MAX_MODELS:
        parser.error("Too many penalties (max is %d)." % MAX_MODELS)
    if not penalties:
        parser.error("At least one penalty must be specified.")
    if options.model_file and len(penalties) != 1:
        parser.error(
            "Exactly one penalty is allowed with a pre-built model.")
    for p in penalties:
        if p > 0:
            parser.error("Penalties should be zero or negative.")

    num_analyses = len(penalties)

    # Set up the files.  The first penalty's layout is used only to get
    # the location of the shared data set.
    file_layout = make_file_layout(options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)
    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files.
        file_layout = make_file_layout(options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(options.selap_path, penalty, file_layout,
                       options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(options.python, options.arrayplot, options.cluster,
                          file_layout, options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print("Archiving results.")
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue
        # Now do some cleanup if multiple analyses were requested.

        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
        ]
        for src, dst in files_to_copy:
            assert os.path.exists(src), "Missing file: %s" % src
            # copy2 keeps the mode and timestamps (like "cp -p"), needs
            # no shell quoting, and raises if the copy fails.
            shutil.copy2(src, dst)

        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print("Done.")