Example no. 1
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """log the input file"""
        import math
        import arrayio
        from genomicode import filelib
        from genomicode import binreg

        signal_file = in_data.identifier
        filelib.assert_exists_nz(signal_file)

        M = arrayio.read(signal_file)
        assert not binreg.is_logged_array_data(M), 'the file is already logged'
        # Change the matrix in place.
        X = M._X
        for i in range(len(X)):
            for j in range(len(X[i])):
                x = X[i][j]
                if x is None:
                    continue
                x = float(x)
                if x < 1:
                    x = 1
                x = math.log(x, 2)
                X[i][j] = x

        M_c = arrayio.convert(M, to_format=arrayio.tab_delimited_format)

        handle = open(outfile, 'w')
        arrayio.tab_delimited_format.write(M_c, handle)
        handle.close()
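
A minimal standalone sketch of the same transform, without the arrayio
dependency (log2_floor is a hypothetical helper name).  Flooring at 1 maps
everything below 1 to log2(1) = 0, which is how the module avoids taking the
log of zero or negative intensities:

import math

def log2_floor(X):
    """Log2-transform a list-of-lists matrix in place, flooring at 1."""
    for row in X:
        for j, x in enumerate(row):
            if x is None:
                continue  # preserve missing values
            row[j] = math.log(max(float(x), 1.0), 2)
    return X

# log2_floor([[0.5, 2, None], [8, 1, 4]])
# -> [[0.0, 1.0, None], [3.0, 0.0, 2.0]]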
Example no. 2
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Check whether the input file is in xls or xlsx format."""
        import arrayio

        in_filename = in_data.identifier
        # Why is this necessary?
        #try:
        #    x = userfile._unhash_storefile(in_data.identifier)
        #    real_name = x[1]
        #except:
        #    pass

        #if (in_data.identifier.endswith('.gz') or in_filename.endswith('.gz')):
        #    unzip_file = module_utils.gunzip(in_data.identifier)
        #else:
        #    unzip_file = in_data.identifier

        ## M = None
        ## xls_file = None
        ## txt_file = unzip_file
        ## try:
        ##     xlrd.open_workbook(unzip_file)
        ##     xls_file = 'tmp.xls'
        ## # XLRDError?  Is this a bug?  This is not the way to catch an exception.
        ## except Exception, XLRDError:
        ##     try:
        ##         # Test this.  book not used?
        ##         book = openpyxl.load_workbook(unzip_file)
        ##         xls_file = 'tmp.xlsx'
        ##     except Exception, InvalidFileException:
        ##         xls_file = None
        ##     except (SystemError, MemoryError, KeyError), x:
        ##         raise

        ## if xls_file:
        ##     shutil.copyfile(unzip_file, xls_file)
        ##     xls2txt_path = config.xls2txt
        ##     xls2txt_BIN = module_utils.which(xls2txt_path)
        ##     assert xls2txt_BIN, 'cannot find the %s' % xls2txt_path
        ##     f = file('tmp1.txt', 'w')
        ##     command = ['python', xls2txt_BIN, xls_file]
        ##     process = subprocess.Popen(command,
        ##                                shell=False,
        ##                                stdout=f,
        ##                                stderr=subprocess.PIPE)
        ##     error_message = process.communicate()[1]
        ##     if error_message:
        ##         raise ValueError(error_message)
        ##     os.remove(xls_file)
        ##     f.close()
        ##     txt_file = 'tmp1.txt'

        to_format = arrayio.tdf
        MATRIX = arrayio.read(in_filename)
        MATRIX_c = arrayio.convert(MATRIX, to_format=to_format)
        to_format.write(MATRIX_c, open(outfile, 'w'))
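
The commented-out block above flags "except Exception, XLRDError" as a bug:
that form binds the raised exception to the name XLRDError instead of
catching that class.  Below is a sketch of the intended format sniffing with
the exception caught properly.  It assumes xlrd and openpyxl are installed;
guess_excel_format is a hypothetical helper, and the exception openpyxl
raises (InvalidFileException) lives at different import paths across
versions, so it is caught broadly here:

import xlrd
import openpyxl

def guess_excel_format(filename):
    """Return 'xls', 'xlsx', or None if the file is neither."""
    try:
        xlrd.open_workbook(filename)
        return 'xls'
    except xlrd.XLRDError:
        pass  # not an old-style Excel file
    try:
        openpyxl.load_workbook(filename)
        return 'xlsx'
    except Exception:
        # openpyxl raises InvalidFileException for non-xlsx input, but its
        # import path varies by version, so catch broadly.
        return None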
Example no. 3
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import quantnorm
        import arrayio
        from genomicode import filelib
        in_data = antecedents
        M = arrayio.read(in_data.identifier)
        Y = quantnorm.normalize(M)
        f = open(outfile, 'w')
        # pcl_format is tab-delimited, so writing with tab_delimited_format
        # preserves the PCL layout.
        Y_c = arrayio.convert(Y, to_format=arrayio.pcl_format)
        arrayio.tab_delimited_format.write(Y_c, f)
        f.close()
        assert filelib.exists_nz(outfile), (
            'the output file %s for quantile normalization is missing or '
            'empty' % outfile)
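
quantnorm.normalize is not shown here.  Assuming it performs standard
quantile normalization, this is a minimal sketch of that algorithm on a
row-major list-of-lists matrix (rows are genes, columns are samples);
quantile_normalize is a hypothetical name, and ties are ranked arbitrarily
rather than averaged as a production implementation would:

def quantile_normalize(X):
    """Give every column the same distribution: the mean distribution."""
    nrow, ncol = len(X), len(X[0])
    # Sort each column, then average across columns at each rank.
    cols = [sorted(X[i][j] for i in range(nrow)) for j in range(ncol)]
    mean_at_rank = [sum(c[i] for c in cols) / float(ncol)
                    for i in range(nrow)]
    # Replace each value with the mean for its rank within its own column.
    Y = [[None] * ncol for _ in range(nrow)]
    for j in range(ncol):
        order = sorted(range(nrow), key=lambda i: X[i][j])
        for rank, i in enumerate(order):
            Y[i][j] = mean_at_rank[rank]
    return Y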
Example no. 4
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        import subprocess
        import arrayio
        from Betsy import module_utils
        from genomicode import filelib
        from genomicode import config
        in_data = antecedents
        bfrm_path = config.bfrmnorm
        bfrm_BIN = module_utils.which(bfrm_path)
        assert bfrm_BIN, 'cannot find %s' % bfrm_path
        num_factor = 1
        #num_factor = 10
        if 'num_factors' in user_options:
            num_factor = int(user_options['num_factors'])
            assert num_factor >= 1, 'the num_factor should be >= 1'
            # What is single_object?
            #M = arrayio.read(single_object.identifier)
            M = arrayio.read(in_data.identifier)
            col_num = M.ncol()
            assert num_factor <= col_num, (
                'the num_factor should not exceed %d' % col_num)

        tmp = 'tmp_dir'
        command = [
            'python', bfrm_BIN, in_data.identifier, '-f',
            str(num_factor), '-o', tmp
        ]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        error_message = process.communicate()[1]
        if error_message:
            raise ValueError(error_message)

        assert filelib.exists_nz(tmp), (
            'the output directory %s for bfrm_normalize is missing' % tmp)
        assert filelib.exists_nz(os.path.join(tmp, 'normalized.gct')), (
            'the output gct file for bfrm_normalize is missing or empty')
        out = os.path.join(tmp, 'normalized.gct')
        M = arrayio.read(out)
        M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
        f = open(outfile, 'w')
        arrayio.tab_delimited_format.write(M_new, f)
        f.close()
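
This module, like several others here, treats any output on stderr as fatal
but never checks the exit status.  A sketch of a slightly more defensive
pattern (run_checked is a hypothetical helper, not part of Betsy or
genomicode):

import subprocess

def run_checked(command):
    """Run a command; raise ValueError if it fails or writes to stderr."""
    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    # Check the return code as well as stderr: some tools exit nonzero
    # without printing anything, and others print warnings yet succeed.
    if process.returncode != 0 or stderr:
        raise ValueError(stderr or 'exit code %d' % process.returncode)
    return stdout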
Example no. 5
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        import arrayio
        from genomicode import jmath
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        metadata = {}

        norm_para = ["variance", "sum_of_squares"]
        assert "gene_normalize" in out_attributes
        normalize = out_attributes["gene_normalize"]
        assert normalize in norm_para, \
               "Invalid normalize option: %s" % normalize

        if normalize == "variance":
            f = open(outfile, 'w')
            M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
            M_n = jmath.safe_norm_mv(M.slice())
            M._X = M_n
            M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
            arrayio.pcl_format.write(M_c, f)
            f.close()
        elif normalize == "sum_of_squares":
            cluster = mlib.get_config("cluster", which_assert_file=True)
            sq = parallel.quote
            cmd = [
                sq(cluster),
                "-f",
                sq(in_data.identifier),
                "-ng",
                "-u",
                outfile,
            ]
            parallel.sshell(cmd)
            metadata["command"] = cmd
            outputfile = outfile + '.nrm'
            filelib.assert_exists_nz(outputfile)
            os.rename(outputfile, outfile)

        filelib.assert_exists_nz(outfile)
        return metadata
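
jmath.safe_norm_mv is not shown.  Assuming it standardizes each gene (row)
to mean 0 and variance 1, here is a sketch of that normalization on a plain
list-of-lists matrix; the zero-variance guard is a guess at what makes the
library version "safe":

import math

def norm_rows_mv(X):
    """Scale each row to mean 0, variance 1; center constant rows only."""
    Y = []
    for row in X:
        n = float(len(row))
        mean = sum(row) / n
        sd = math.sqrt(sum((x - mean) ** 2 for x in row) / n)
        if sd == 0:
            Y.append([0.0] * len(row))  # constant row: nothing to scale
        else:
            Y.append([(x - mean) / sd for x in row])
    return Y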
Example no. 6
def run_cluster30(filename, algorithm, user_options, **more_args):
    import arrayio
    from genomicode import cluster30
    from Betsy import module_utils as mlib

    MATRIX_FILE = "data.pcl"

    DISTANCE_MEASURES = cluster30.DIST2ID.keys()
    YESNO = ["yes", "no"]

    cluster_genes = mlib.get_user_option(user_options,
                                         "cluster_genes",
                                         not_empty=True,
                                         allowed_values=YESNO)
    cluster_arrays = mlib.get_user_option(user_options,
                                          "cluster_arrays",
                                          not_empty=True,
                                          allowed_values=YESNO)
    distance_metric = mlib.get_user_option(user_options,
                                           "distance_measure",
                                           not_empty=True,
                                           allowed_values=DISTANCE_MEASURES)

    # Make a PCL-formatted file for cluster 3.0.  It might
    # misinterpret the columns of a tab-delimited file.
    matrix = arrayio.read(filename)
    matrix = arrayio.convert(matrix, to_format=arrayio.pcl_format)
    arrayio.write(matrix, open(MATRIX_FILE, 'w'))

    jobname = "cluster"
    cmd = cluster30.cluster30_file(MATRIX_FILE, (cluster_genes == "yes"),
                                   (cluster_arrays == "yes"),
                                   algorithm,
                                   distance=distance_metric,
                                   jobname=jobname,
                                   **more_args)

    # Find the output files and name them appropriately.
    cluster_files = cluster30._find_cluster_files(jobname)
    fix_cluster30_dup_header(cluster_files["cdt"])

    return cmd, cluster_files
Example no. 7
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Convert a signal file to PCL format."""
        import shutil
        import arrayio
        from genomicode import filelib
        in_data = antecedents
        # choose_format returns a format module, e.g. arrayio.pcl_format;
        # slicing __name__ strips the "arrayio." prefix and "_format" suffix.
        fmt = arrayio.choose_format(in_data.identifier)
        if fmt.__name__[8:-7] == 'pcl':
            shutil.copyfile(in_data.identifier, outfile)
        else:
            M = arrayio.read(in_data.identifier)
            M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
            f = open(outfile, 'w')
            arrayio.pcl_format.write(M_c, f)
            f.close()

        assert filelib.exists_nz(outfile), (
            'the output file %s for convert_signal_to_pcl is missing or '
            'empty' % outfile)
Example no. 8
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import arrayio
        from genomicode import arrayplatformlib as apl

        MATRIX = arrayio.read(antecedents.identifier)

        # Converter will just use the first two columns for NAME and
        # Description.  Try to find better ones.
        # Column 1: PROBE_ID, GENE_ID, <col 1>
        # Column 2: GENE_SYMBOL, <col 2>
        num_headers = len(MATRIX.row_names())
        cat2header = apl.categorize_headers(MATRIX, remove_version=True)

        h1 = cat2header.get(apl.PROBE_ID)
        if not h1:
            h1 = cat2header.get(apl.GENE_ID)
        if not h1 and num_headers:
            h1 = MATRIX.row_names()[0]

        h2 = cat2header.get(apl.GENE_SYMBOL)
        if not h2 and num_headers >= 2:
            h2 = MATRIX.row_names()[1]

        if h2:
            i = MATRIX._row_order.index(h2)
            MATRIX._row_order.pop(i)
            MATRIX._row_order.insert(0, h2)
        if h1:
            i = MATRIX._row_order.index(h1)
            MATRIX._row_order.pop(i)
            MATRIX._row_order.insert(0, h1)

        MATRIX_c = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

        arrayio.gct_format.write(MATRIX_c, open(outfile, 'w'))
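
The pop/insert calls above are a move-to-front idiom on the private
_row_order list: h2 is moved first and h1 second, so h1 ends up in column 1
and h2 in column 2.  The same idiom on a plain list (move_to_front is a
hypothetical helper):

def move_to_front(items, name):
    """Move name to the front of items, in place."""
    items.insert(0, items.pop(items.index(name)))

# headers = ['Description', 'GENE_SYMBOL', 'PROBE_ID']
# move_to_front(headers, 'GENE_SYMBOL')  # symbol first
# move_to_front(headers, 'PROBE_ID')     # probe ID first, symbol second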
Example no. 9
def main():
    #from optparse import OptionParser, OptionGroup
    from optparse import OptionParser

    usage = "usage: %prog [options] <file1> <file2> ..."
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("-f",
                      "--num_factors",
                      dest="num_factors",
                      type="int",
                      default=15,
                      help="Number of factors to use for normalization.")
    # Any string in the control probe file can be a control probe.
    # Delimited by tabs and newlines.
    parser.add_option("",
                      "--control_probe_file",
                      dest="control_probe_file",
                      default=None,
                      help="File that contains the control probes.")
    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--bfrm",
                      dest="bfrm_path",
                      default=None,
                      help="Specify the path to the BFRM_normalize directory.")
    parser.add_option("",
                      "--matlab",
                      dest="matlab",
                      default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--povray",
                      dest="povray",
                      default="povray",
                      help="Specify the command to run povray.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import this after the library path is set.
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()

    genepattern.fix_environ_path()

    if not args:
        parser.error("Please specify files to normalize.")
    filenames = args
    names = [os.path.split(x)[-1] for x in filenames]
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Check to make sure value for num_factors is reasonable.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors < MIN_FACTORS:
        if MIN_FACTORS == 1:
            parser.error("At least %d factor is required." % MIN_FACTORS)
        else:
            parser.error("At least %d factors are required." % MIN_FACTORS)
    elif options.num_factors > MAX_FACTORS:
        parser.error("%d factors is too many.  Maximum is %d." %
                     (options.num_factors, MAX_FACTORS))

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read each of the input files and align them.
    matrices = read_matrices(filenames)

    # Make sure the number of factors doesn't exceed the size of the
    # matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Standardize each of the matrices to GCT format.
    if 1:  # for debugging
        for i in range(len(matrices)):
            matrices[i] = arrayio.convert(matrices[i],
                                          to_format=arrayio.gct_format)
        write_dataset(file_layout.DS_ORIG, matrices)

    # Log each of the matrices if needed.
    if 1:  # for debugging
        log_matrices(names, matrices)
        write_dataset(file_layout.DS_PROC, matrices)
        sys.stdout.flush()

    # Format the parameters and output files for bfrm.
    if 1:  # for debugging
        run_bfrm(options.bfrm_path, options.num_factors,
                 options.control_probe_file, file_layout, options.matlab)

    # Generate some files for output.
    if 1:  # for debugging
        summarize_dataset(file_layout)
        summarize_filtered_genes(file_layout)
    summarize_heatmaps(options.python, options.arrayplot, options.cluster,
                       file_layout, options.libpath)
    summarize_pca(options.povray, file_layout, matrices)
    summarize_report(filenames, matrices, options.num_factors, start_time,
                     file_layout)

    # Archive the BFRM stuff, and the big files.
    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.BFRM, noclobber=False)
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        #archive.zip_path(file_layout.DS_PROC, noclobber=False)
        #archive.zip_path(file_layout.DS_FINAL, noclobber=False)

    print "Done."
Example no. 10
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        import subprocess
        from Betsy import module_utils
        from genomicode import config
        from genomicode import filelib
        import arrayio

        data_node, cls_node = antecedents
        tmp = os.path.join(".", 'tmp.txt')
        f = open(tmp, 'w')
        M = arrayio.read(data_node.identifier)
        M_c = arrayio.convert(M, to_format=arrayio.gct_format)
        arrayio.gct_format.write(M_c, f)
        f.close()
        module_name = 'ClassNeighbors'
        gp_parameters = dict()
        gp_parameters['data.filename'] = tmp
        gp_parameters['class.filename'] = cls_node.identifier
        if 'cn_num_neighbors' in user_options:
            gp_parameters['num.neighbors'] = str(
                user_options['cn_num_neighbors'])

        if 'cn_num_perm' in user_options:
            if user_options['cn_num_perm'].isdigit():
                gp_parameters['num.permutations'] = str(
                    user_options['cn_num_perm'])

        if 'cn_user_pval' in user_options:
            if module_utils.is_number(user_options['cn_user_pval']):
                gp_parameters['user.pval'] = str(user_options['cn_user_pval'])

        mean_median = {'mean': '', 'median': '-d'}
        if out_attributes['cn_mean_or_median'] in ['mean', 'median']:
            gp_parameters['mean.or.median'] = mean_median[
                out_attributes['cn_mean_or_median']]

        p = {'t_test': '', 'snr': '-S'}
        # Membership must be tested against the keys, not the values;
        # the value is then used to look up the flag.
        if out_attributes['cn_ttest_or_snr'] in p:
            gp_parameters['ttest.or.snr'] = p[
                out_attributes['cn_ttest_or_snr']]

        if out_attributes['cn_filter_data'] in ['yes', 'no']:
            gp_parameters['filter.data'] = str(
                out_attributes['cn_filter_data'])

        if 'cn_abs_diff' in user_options:
            if module_utils.is_number(user_options['cn_abs_diff']):
                gp_parameters['min.abs.diff'] = str(
                    user_options['cn_abs_diff'])

        if 'cn_min_threshold' in user_options:
            if module_utils.is_number(user_options['cn_min_threshold']):
                gp_parameters['min.threshold'] = str(
                    user_options['cn_min_threshold'])

        if 'cn_max_threshold' in user_options:
            if module_utils.is_number(user_options['cn_max_threshold']):
                gp_parameters['max.threshold'] = str(
                    user_options['cn_max_threshold'])

        if 'cn_min_folddiff' in user_options:
            if module_utils.is_number(user_options['cn_min_folddiff']):
                gp_parameters['min.fold.diff'] = str(
                    user_options['cn_min_folddiff'])

        gp_path = config.genepattern
        gp_module = module_utils.which(gp_path)
        assert gp_module, 'cannot find %s' % gp_path
        download_directory = os.path.join(".", 'class_neighbors_result')
        command = [gp_module, module_name, '-o', download_directory]
        for key in gp_parameters.keys():
            a = ['--parameters', key + ':' + gp_parameters[key]]
            command.extend(a)

        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() already waits for the process to finish; calling
        # wait() first can deadlock once the stdout/stderr pipes fill up.
        error_message = process.communicate()[1]
        if error_message:
            raise ValueError(error_message)

        assert os.path.exists(download_directory), (
            'there is no output directory for class_neighbors')
        result_files = os.listdir(download_directory)
        assert 'stderr.txt' not in result_files, 'GenePattern reported an error'
        os.remove(tmp)
        gene_list = []
        for result_file in result_files:
            if result_file.endswith('.odf'):
                f = open(os.path.join(download_directory, result_file), 'r')
                text = f.read()
                text = text.split('\n')
                f.close()
                # Fixed ODF layout: line 8 holds "NumNeighbors=<n>" and
                # line 14 is the first data row.
                numline = 8
                startline = 14
                assert text[numline].startswith(
                    'NumNeighbors'), 'unexpected odf header format'
                number_gene = int(text[numline].split('=')[1])
                assert text[startline].startswith(
                    '1'), 'unexpected odf data start line'

                for line in text[startline:startline + number_gene]:
                    lines = line.split('\t')
                    gene_list.append(lines[10])

        f = open(outfile, 'w')
        f.write('\t'.join(gene_list))
        f.close()
        assert filelib.exists_nz(outfile), (
            'the output file %s for rank_genes_by_class_neighbors is '
            'missing or empty' % outfile)
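
The ODF parsing above hard-codes line 8 for the NumNeighbors header and
line 14 for the first data row.  A sketch that scans for those landmarks
instead (parse_odf_genes is a hypothetical helper; it keeps the same
assumptions that data lines are tab-delimited, numbered from 1, and carry
the gene name in column 10):

def parse_odf_genes(text, gene_col=10):
    """Pull the gene names out of a ClassNeighbors ODF file."""
    lines = text.split('\n')
    number_gene = startline = None
    for i, line in enumerate(lines):
        if line.startswith('NumNeighbors'):
            number_gene = int(line.split('=')[1])
        elif number_gene is not None and line.startswith('1\t'):
            startline = i
            break
    assert number_gene is not None and startline is not None, \
        'unexpected odf file format'
    return [line.split('\t')[gene_col]
            for line in lines[startline:startline + number_gene]]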
Example no. 11
def main():
    import os
    import argparse
    import subprocess
    import StringIO
    import zipfile
    import shutil

    import arrayio
    from genomicode import config
    from genomicode import arraysetlib

    parser = argparse.ArgumentParser(description="Do a GSEA analysis.")
    parser.add_argument("expression_file", help="Gene expression file.")
    parser.add_argument("outpath", help="Where to save the files.")
    parser.add_argument("--clobber",
                        default=False,
                        action="store_true",
                        help="Overwrite outpath, if it already exists.")
    parser.add_argument("--dry_run",
                        default=False,
                        action="store_true",
                        help="Set up the file, but do not run GSEA.")

    group = parser.add_argument_group(title="Class Labels")
    group.add_argument("--cls_file", default=None, help="Class label file.")
    group.add_argument("--indexes1",
                       default=None,
                       help="Which columns in group 1, E.g. 1-5,8 (1-based, "
                       "inclusive).")
    group.add_argument(
        "--indexes2",
        default=None,
        help="(OPTIONAL) Which columns in group 2.  If not given, then "
        "will use any sample not included in --indexes1.")
    group.add_argument(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then column 1 is the first column "
        "with data.  If given, then column 1 is the very first column in "
        "the file, including the headers.")
    group.add_argument("--name1", default=None, help="Name for group 1.")
    group.add_argument("--name2", default=None, help="Name for group 2.")

    group = parser.add_argument_group(title="Other Parameters")
    group.add_argument(
        "--platform",
        default=None,
        help="The platform (GenePattern chip) of the expression data, "
        "e.g. HG_U133A_2.chip.  You should leave this blank if the IDs "
        "in the gene expression data set are gene symbols.  Allowed "
        "values can be found on the GenePattern/GSEA website.")
    group.add_argument(
        '--min_match_score',
        default=0.95,
        type=float,
        help="When trying to identify the rows of a matrix or geneset, "
        "require at least this portion of the IDs to be recognized.")
    x = sorted(DATABASE2GENESET)
    x = [x.replace(DEFAULT_DATABASE, "%s (DEFAULT)" % x) for x in x]
    x = ", ".join(x)
    x = "Which database to search.  Possible values are: %s." % x
    group.add_argument("--database", default=DEFAULT_DATABASE, help=x)
    group.add_argument(
        "--database_file",
        default=None,
        help="Search a GMT or GMX file instead of the default databases.")
    group.add_argument(
        "--no_collapse_dataset",
        default=False,
        action="store_true",
        help="Do not 1) convert gene IDs to gene symbols, and do not 2) "
        "collapse duplicate gene symbols.  Set this if the gene IDs are "
        "already unique gene symbols.  Also, can use this if you "
        "provide the database_file and the gene IDs match the ones "
        "in our gene expression file.")

    # phenotype is more accurate.  But if only 2 samples, need to be
    # gene_set.  (Not sure about 3 samples?  Where is the limit?)
    group.add_argument(
        "--permutation_type",
        default="phenotype",
        choices=["phenotype", "gene_set"],
        help="Default is phenotype.  With <= 6 samples, recommend using "
        "gene_set instead.")

    args = parser.parse_args()
    assert os.path.exists(args.expression_file), \
        "File not found: %s" % args.expression_file

    assert type(args.min_match_score) is type(0.0)
    assert args.min_match_score > 0.2, "min_match_score too low"
    assert args.min_match_score <= 1.0, "min_match_score too high"

    # Must have either the indexes or the cls_file, but not both.
    assert args.cls_file or args.indexes1, (
        "Must provide either CLS file or the indexes for one group.")
    assert not (args.cls_file and args.indexes1), (
        "Cannot provide both a CLS file and the indexes.")
    assert not (args.cls_file and args.indexes2), (
        "Cannot provide both a CLS file and the indexes.")
    assert not (args.indexes2 and not args.indexes1)
    if args.cls_file:
        assert os.path.exists(args.cls_file), \
            "File not found: %s" % args.cls_file
        assert not args.name1
        assert not args.name2
    assert args.outpath, "Please specify an outpath."
    assert not os.path.exists(args.outpath) or args.clobber, \
        "Outpath %s already exists." % args.outpath
    if os.path.exists(args.outpath):
        shutil.rmtree(args.outpath)
    os.mkdir(args.outpath)

    MATRIX = arrayio.read(args.expression_file)

    # Make a CLS file, if necessary.
    if args.cls_file:
        names, classes = arraysetlib.read_cls_file(args.cls_file)
        assert len(names) == 2, "I must have 2 classes."
        name1, name2 = names
    else:
        x = arraysetlib.resolve_classes(MATRIX, args.indexes1, args.indexes2,
                                        args.indexes_include_headers,
                                        args.name1, args.name2)
        name1, name2, classes = x

    x = fix_class_order(MATRIX, name1, name2, classes)
    MATRIX, name1, name2, classes = x

    # Remove samples that aren't in any classes.
    for c in classes:
        assert c in [0, 1, None]
    I = [i for (i, x) in enumerate(classes) if x is not None]
    classes = [classes[i] for i in I]
    MATRIX = MATRIX.matrix(None, I)

    handle = StringIO.StringIO()
    arraysetlib.write_cls_file(handle, name1, name2, classes)
    cls_data = handle.getvalue()

    # Convert the format after making CLS file, or else args.indexes1
    # with args.indexes_include_headers might be off.
    # BUG: What if the conversion to GCT discards the proper platform
    # of this matrix?
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    database_file = None
    if args.database_file:
        database_file = os.path.realpath(args.database_file)
        assert os.path.exists(database_file), ("I could not find file: %s" %
                                               database_file)
    # Required, even if gene.sets.database.file is given.
    gene_set_database = format_gene_set_database(args.database)

    # If no database file is given, then we need to know the platform
    # for the expression data.  (If one is given, we let the user make
    # sure the platforms match.)
    platform = args.platform
    if platform is None and not database_file and not args.no_collapse_dataset:
        platform = guess_chip_platform(MATRIX, args.min_match_score)
        # If gene symbols already provided, then turn off collapse_dataset.
        if platform is None:  # Gene Symbol
            args.no_collapse_dataset = True

    # Do some sanity checking to make sure inputs are reasonable.
    MATRIX = check_matrix(MATRIX)
    check_classes(classes, args.permutation_type)

    # Set up file names.
    opj = os.path.join
    x = os.path.split(args.expression_file)[1]
    if x.lower().endswith(".gz"):
        x = x[:-3]
    x = os.path.splitext(x)[0]
    x = x.replace(" ", "_")  # GenePattern cannot work with spaces.
    assert x, "empty file name"
    gct_file = "%s.gct" % x
    cls_file = "%s.cls" % x
    out_file = "%s.zip" % x  # GenePattern saves output to <file>.zip.
    gct_full = opj(args.outpath, gct_file)
    cls_full = opj(args.outpath, cls_file)
    out_full = opj(args.outpath, out_file)
    if database_file:
        db_file = os.path.split(database_file)[1]
        db_full = opj(args.outpath, db_file)

    # Write the gene expression, class label, and database files.  It
    # is better to have local copies of the files.  It is unclear how
    # to upload files to GenePattern if the file names have spaces in
    # them.  Get around this by making all the files local.
    arrayio.gct_format.write(MATRIX, open(gct_full, 'w'))
    open(cls_full, 'w').write(cls_data)
    if database_file:
        open(db_full, 'w').write(open(database_file).read())

    collapse_dataset = "true"
    if args.no_collapse_dataset:
        collapse_dataset = "false"

    # Set up the analysis.
    params = {
        "expression.dataset": gct_file,
        "phenotype.labels": cls_file,
        "collapse.dataset": collapse_dataset,
        "permutation.type": args.permutation_type,
    }
    # platform is required, even if collapse.dataset is false.  If no
    # platform is given, then specify a default one.
    #CHIP_PLATFORM = "chip.platform"
    CHIP_PLATFORM = "chip.platform.file"
    params[CHIP_PLATFORM] = platform
    if params[CHIP_PLATFORM] is None:
        params[CHIP_PLATFORM] = "HG_U133A.chip"
    if database_file:
        params["gene.sets.database.file"] = db_file
    # Required, even if gene.sets.database.file is given.
    params["gene.sets.database"] = gene_set_database

    if args.dry_run:
        return

    cmd = [config.genepattern, "-o", ".", "GSEA"]
    for (key, value) in reversed(list(params.iteritems())):
        x = ["--parameters", "%s:%s" % (key, value)]
        cmd.extend(x)
    #print " ".join(cmd)
    #import sys; sys.exit(0)

    # Run the analysis in the outpath.  GSEA leaves a file
    # "System.out" in the current directory.
    cwd = os.getcwd()
    try:
        os.chdir(args.outpath)
        p = subprocess.Popen(cmd,
                             bufsize=0,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             close_fds=True)
        w, r = p.stdin, p.stdout
        w.close()
        # Check for errors in the output.
        x = r.read()
        # Get rid of GenePattern garbage.
        data = x.replace("Loading required package: rJava", "")
        p.wait()
    finally:
        os.chdir(cwd)
    x = data.strip()
    # rpy2 generates UserWarnings for some reason.
    # /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/
    #   python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
    #       res = super(Function, self).__call__(*new_args, **new_kwargs)
    if x.find("UserWarning") >= 0 and x.endswith("(*new_args, **new_kwargs)"):
        # Ignore this UserWarning.
        x = ""
    # Also ignore this RRuntimeWarning.
    # /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/
    #   python2.7/site-packages/rpy2/rinterface/__init__.py:185:
    #   RRuntimeWarning:
    #
    #   warnings.warn(x, RRuntimeWarning)
    if x.find("RRuntimeWarning") >= 0 and \
           x.endswith("warnings.warn(x, RRuntimeWarning)"):
        x = ""
    assert not x, "%s\n%s" % (" ".join(map(str, cmd)), data)

    error_file = os.path.join(args.outpath, "stderr.txt")
    assert not os.path.exists(error_file), (
        "Error generated by GenePattern:\n%s" % open(error_file).read())

    # Unzip the zipped results in the outpath.
    assert os.path.exists(out_full), "ERROR: Output file is missing [%s]." % \
        out_full
    zfile = zipfile.ZipFile(out_full)
    zfile.extractall(args.outpath)
    os.unlink(out_full)

    x = os.path.join(args.outpath, "index.html")
    assert os.path.exists(x), "I could not find the GSEA output: %s" % x
Example no. 12
def main():
    from optparse import OptionParser, OptionGroup

    usage = "usage: %prog [options] <bfrm_model> <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--bfrm_path",
                      dest="bfrm_path",
                      default=None,
                      help="Specify the path to BFRM_project.")
    parser.add_option("",
                      "--matlab",
                      dest="matlab",
                      default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import this after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 2:
        parser.error("Please specify files.")
    model_file, filename = args
    assert os.path.exists(model_file), "File not found: %s" % model_file
    assert os.path.exists(filename), "File not found: %s" % filename

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    x = arrayio.read(filename)
    MATRIX = arrayio.convert(x, to_format=arrayio.gct_format)
    print "Read data set with %d genes and %d samples." % (MATRIX.nrow(),
                                                           MATRIX.ncol())

    log_matrix(MATRIX)

    # Write out the data sets.
    write_dataset(file_layout.DATASET, MATRIX)

    # Save the BFRM model.
    write_model(model_file, file_layout)

    # Run BFRM projection.
    run_bfrm_project(file_layout, options.bfrm_path, options.matlab)

    # Generate output files.
    summarize_factor_scores(file_layout, options.python, options.arrayplot,
                            options.cluster, options.libpath)

    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        archive.zip_path(file_layout.BFRM, noclobber=False)

    print "Done."
Example no. 13
def main():
    from optparse import OptionParser, OptionGroup
    
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-r", "--rma", dest="rma_dataset", type="string", default=None,
        help="Specify the RMA-normalized data to analyze.")
    parser.add_option(
        "-m", "--mas5", dest="mas5_dataset", type="string", default=None,
        help="Specify the MAS5-normalized data to analyze.")
    parser.add_option(
        "-i", "--illu", dest="illu_dataset", type="string", default=None,
        help="Specify the Illumina data to analyze.")
    parser.add_option(
        "", "--sigdb_path", dest="sigdb_path", type="string", default=None,
        help="Location of the sigdb/ directory.")
    parser.add_option(
        "", "--sigtag", dest="signature_tags", default=[], action="append",
        help="Specify a specific tag to use.")
    parser.add_option(
        "", "--sigid", dest="signature_ids", default=[], action="append",
        help="Specify a specific signature to use.")
    parser.add_option(
        "", "--max_signatures", dest="max_signatures", type="int",
        default=None,
        help="Maximum number of signatures to run (for DEBUGGING).")
    parser.add_option(
        "-j", "", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")
    parser.add_option(
        "-z", "", dest="archive", action="store_true", default=False,
        help="Archive the individual signatures.  Helpful for GenePattern.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "", "--gp_imod_all_vars", dest="gp_imod_all_vars", type="string",
        default=None,
        help="Special internal variable for use with GenePattern "
        "interactive modules.")
    parser.add_option(
        "", "--debug_gp_imod_all_vars", action="store_true", default=False, 
        dest="debug_gp_imod_all_vars",
        )
    
    #group = OptionGroup(parser, "Normalization")
    #group.add_option(
    #    "", "--normalization", dest="normalization", default="MAS5",
    #    help="How was the data set normalized (default MAS5).")
    #group.add_option(
    #    "-l", "--log_data", dest="log_data", action="store_true",
    #    default=False,
    #    help="Log the MAS5 data before analyzing.")
    #parser.add_option_group(group)

    group = OptionGroup(parser, "Pybinreg")
    group.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python.")
    group.add_option(
        "", "--matlab", dest="matlab", default=None,
        help="Specify the command to run matlab.")
    group.add_option(
        "", "--povray", dest="povray", default=None,
        help="Specify the command to run povray.")
    group.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    group.add_option(
        "", "--binreg", dest="binreg_path", default=None,
        help="Specify the path to the BinReg2.0 code.")
    group.add_option(
        "", "--pybinreg", dest="pybinreg", default=None,
        help="Specify the command to run pybinreg.py.")
    group.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option_group(group)

    options, args = parser.parse_args()
    #if len(args) < 1:
    #    #print sys.argv
    #    #print len(args), args
    #    parser.error("Please specify sigdb_path.")
    #elif len(args) > 1:
    #    parser.error("Too many arguments.")
    if args:
        parser.error("Too many arguments.")

    # DEBUG the gp_imod_all_vars variable.
    if options.debug_gp_imod_all_vars:
        assert not options.gp_imod_all_vars
        options.gp_imod_all_vars = (
            "mas5_expression_file_cb=file&mas5_expression_file_url=&"
            "rma_expression_file_cb=file&rma_expression_file_url=&"
            # Skip AKT signature.
            "sig_AKT=no&"
            # Change BCAT normalization.
            "sig_BCAT=yes (custom parameters)&"
            "sig_BCAT_apply_quantile_normalization=no&"
            "sig_BCAT_apply_shiftscale_normalization=no&"
            "sig_BCAT_num_genes=85&sig_BCAT_num_metagenes=2&"
            # No changes in E2F1.
            "sig_E2F1=yes (custom parameters)&"
            "sig_E2F1_apply_quantile_normalization=yes&"
            "sig_E2F1_apply_shiftscale_normalization=yes&"
            "sig_E2F1_num_genes=150&sig_E2F1_num_metagenes=2&"
            # Change genes in EGFR.
            "sig_EGFR=yes (custom parameters)&"
            "sig_EGFR_apply_quantile_normalization=no&"
            "sig_EGFR_apply_shiftscale_normalization=yes&"
            #"sig_EGFR_num_genes=50000&sig_EGFR_num_metagenes=2&"
            "sig_EGFR_num_genes=501&sig_EGFR_num_metagenes=2&"
            # Change quantile, genes, metagenes in ER.
            "sig_ER=yes (custom parameters)&"
            "sig_ER_apply_quantile_normalization=no&"
            "sig_ER_apply_shiftscale_normalization=yes&"
            "sig_ER_num_genes=150&sig_ER_num_metagenes=3&"
            "sig_HER2=yes (default parameters)&"
            "sig_IFNalpha=yes (default parameters)&"
            "sig_IFNgamma=yes (default parameters)&"
            "sig_MYC=yes (default parameters)&"
            "sig_P53=yes (default parameters)&"
            "sig_P63=yes (default parameters)&"
            "sig_PI3K=yes (default parameters)&"
            "sig_PR=yes (default parameters)&"
            "sig_RAS=yes (default parameters)&"
            "sig_SRC=yes (default parameters)&"
            "sig_STAT3=yes (default parameters)&"
            "sig_TGFB=yes (default parameters)&"
            "sig_TNFa=yes (default parameters)&"
            "which_signatures=I choose myself"
            )
        
    datafile_rma = datafile_mas5 = datafile_illu = None
    if options.rma_dataset is not None:
        assert os.path.exists(options.rma_dataset), \
               "RMA file not found: %s" % options.rma_dataset
        datafile_rma = os.path.realpath(options.rma_dataset)
    if options.mas5_dataset is not None:
        assert os.path.exists(options.mas5_dataset), \
               "MAS5 file not found: %s" % options.mas5_dataset
        datafile_mas5 = os.path.realpath(options.mas5_dataset)
    if options.illu_dataset is not None:
        assert os.path.exists(options.illu_dataset), \
               "ILLU file not found: %s" % options.illu_dataset
        datafile_illu = os.path.realpath(options.illu_dataset)
    assert datafile_rma or datafile_mas5 or datafile_illu, \
           "Please specify at least one data set."

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import time
    import arrayio
    from genomicode import config
    from genomicode import parallel
    from genomicode import archive
    from genomicode import hashlib
    from genomicode import matrixlib
    from genomicode import genepattern
    
    #sigdb_path, = args
    x = options.sigdb_path or config.sigdb_path
    sigdb_path = os.path.realpath(x)
    assert os.path.exists(sigdb_path), \
           "I could not find the signatures database: %s." % sigdb_path

    start_time = time.time()
    
    genepattern.fix_environ_path()
    
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the signatures and select the ones to score.
    # BUG: Should allow this to be specified on the command line.
    desired_tags = ["Pathway"]  # default
    if options.signature_tags:
        desired_tags = options.signature_tags[:]
    all_normalization = ["RMA", "MAS5", "ILLU"]
    desired_normalization = []
    if datafile_rma is not None:   # RMA datafile is specified.
        desired_normalization.append("RMA")
    if datafile_mas5 is not None:  # MAS5 datafile is specified.
        desired_normalization.append("MAS5")
    if datafile_illu is not None:  # ILLU datafile is specified.
        desired_normalization.append("ILLU")
        
    # If any signature IDs are specified, then use only those IDs and
    # ignore the desired tags.
    print "Reading signature database: %s." % sigdb_path
    desired_ids = []
    if options.signature_ids:
        desired_ids = options.signature_ids[:]
    x = read_signatures(
        sigdb_path, all_normalization, desired_ids, desired_tags)
    signatures = x
    orig_signatures = signatures[:]

    # Filter for just the normalization that we have data files for.
    # Keep track of why we filtered out certain signatures.
    why_dropped = {}  # ID -> explanation as string
    good = []
    for sig in signatures:
        if sig.Normalization.upper() in desired_normalization:
            good.append(sig)
            continue
        x = "Signature requires %s normalized data, but it was not provided."%(
            sig.Normalization.upper())
        why_dropped[sig.xID] = x
    signatures = good
    assert signatures, "No signatures available."

    # Process additional parameters from GenePattern.
    # o Do this before max_signatures, so that the maximum signatures
    #   is selected only out of the ones that the user specified.
    # o Do this before names and paths, so the variables will be
    #   aligned.
    # gp_imod_all_vars can be None or "".
    if options.gp_imod_all_vars:
        x = process_gp_imod_all_vars(
            options.gp_imod_all_vars, signatures, why_dropped)
        signatures, why_dropped = x

    sys.stdout.flush()
    DATA_rma = DATA_mas5 = DATA_illu = None
    if datafile_rma is not None:
        print "Reading RMA file: %s" % datafile_rma
        DATA_rma = arrayio.read(datafile_rma)
        DATA_rma = arrayio.convert(DATA_rma, to_format=arrayio.gct_format)
    if datafile_mas5 is not None:
        print "Reading MAS5 file: %s" % datafile_mas5
        DATA_mas5 = arrayio.read(datafile_mas5)
        DATA_mas5 = arrayio.convert(DATA_mas5, to_format=arrayio.gct_format)
    if datafile_illu is not None:
        print "Reading ILLU file: %s" % datafile_illu
        DATA_illu = arrayio.read(datafile_illu)
        DATA_illu = arrayio.convert(DATA_illu, to_format=arrayio.gct_format)
    # Don't handle the log.  Let pybinreg do it.
    # Make sure the data sets contain the same samples.  Align them if
    # necessary.
    DATA_all = [
        ("DATA_rma", DATA_rma), ("DATA_mas5", DATA_mas5),
        ("DATA_illu", DATA_illu)]
    DATA_all = [x for x in DATA_all if x[1]]
    for i in range(1, len(DATA_all)):
        key1, data1 = DATA_all[0]
        key2, data2 = DATA_all[i]
        assert key1 != key2
        assert data1 and data2
        assert data1.ncol() == data2.ncol(), \
               "%s and %s data sets have different numbers of samples." % (
            key1, key2)
        if matrixlib.are_cols_aligned(data1, data2):
            continue
        x = matrixlib.align_cols(data1, data2)
        data1_new, data2_new = x
        assert matrixlib.are_cols_aligned(data1_new, data2_new)
        # The samples in data1 (the reference) should not be changed.
        assert data1.ncol() == data1_new.ncol(), \
               "%s and %s data sets have different samples" % (
            key1, key2)
        assert matrixlib.are_cols_aligned(data1, data1_new)
        DATA_all[i] = key2, data2_new
    for key, data in DATA_all:
        if key == "DATA_rma":
            DATA_rma = data
        elif key == "DATA_mas5":
            DATA_mas5 = data
        elif key == "DATA_illu":
            DATA_illu = data
        else:
            raise AssertionError, "Unknown key: %s" % key
    print "Writing aligned signal files."
    if DATA_rma:
        arrayio.gct_format.write(
            DATA_rma, open(file_layout.DATASET_RMA, 'w'))
    if DATA_mas5:
        arrayio.gct_format.write(
            DATA_mas5, open(file_layout.DATASET_MAS5, 'w'))
    if DATA_illu:
        arrayio.gct_format.write(
            DATA_illu, open(file_layout.DATASET_ILLU, 'w'))

    # Figure out the names and paths for each signature.
    print "Finding signatures."
    names = [None] * len(signatures)   # SIG19_AKT[_modified]
    paths = [None] * len(signatures)   # <path>/SIG19_AKT[_modified]
    for i, sig in enumerate(signatures):
        name = "SIG%02d_%s" % (sig.xID, hashlib.hash_var(sig.Name))
        # If the user has modified the signature from the default
        # parameters, then make a note of it.
        if getattr(sig, "Changed", False):
            name = "%s_modified" % name
        outpath = os.path.join(file_layout.OUTPATH, name)
        names[i] = name
        paths[i] = outpath

    if options.max_signatures is not None:
        signatures = signatures[:options.max_signatures]

    # Make a list of the jobs.
    jobs = []  # list of cmd, outpath, outfile
    for i, sig in enumerate(signatures):
        name, outpath = names[i], paths[i]
        #print "Generating signature %s [%d:%d]" % (
        #    name, i+1, len(signatures))
        #sys.stdout.flush()
        
        quantile_normalize = False
        assert sig.Quantile.upper() in ["YES", "NO"]
        if sig.Quantile.upper() == "YES":
            quantile_normalize = True
        shift_scale_normalize = False
        assert sig.Shift_Scale.upper() in ["YES", "NO"]
        if sig.Shift_Scale.upper() == "YES":
            shift_scale_normalize = True
        
        #outfile = os.path.join(files.outpath, "%s.out.txt" % name)
        outfile = os.path.join(outpath, "out.txt")

        if sig.Normalization.upper() == "RMA":
            datafile = file_layout.DATASET_RMA
            assert DATA_rma
        elif sig.Normalization.upper() == "MAS5":
            datafile = file_layout.DATASET_MAS5
            assert DATA_mas5
        elif sig.Normalization.upper() == "ILLU":
            datafile = file_layout.DATASET_ILLU
            assert DATA_illu
        else:
            raise AssertionError, "Unknown normalization."

        # If the entire analysis should be archived, then go ahead and
        # archive each of the pybinreg runs too.  This will prevent
        # large analyses from taking up too much disk space.  The
        # drawback is that the files that are archived are no longer
        # available for use here.  Hopefully this won't be a problem.
        cmd = make_pybinreg_cmd(
            options.pybinreg, options.python, options.binreg_path,
            options.matlab, options.arrayplot, options.povray,
            options.cluster, options.libpath,
            outpath, options.archive, sig.Genes, sig.Metagenes,
            quantile_normalize, shift_scale_normalize,
            sig.Train0, sig.Train1, datafile)
        x = cmd, outpath, outfile
        jobs.append(x)

    # Run each of the jobs.
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if options.num_procs > 1:
        if parallel._find_parallel():
            num_sigs = min(options.num_procs, len(jobs))
            if num_sigs > 1:
                print "Predicting %d signatures at a time." % num_sigs
        else:
            print("I could not find GNU parallel.  "
                  "Predicting 1 signature at a time.")
            options.num_procs = 1
        sys.stdout.flush()

    DEBUG = False   # Can disable pybinreg temporarily for debugging.
    if not DEBUG:  
        if options.num_procs <= 1:
            for x in jobs:
                cmd, outpath, outfile = x
                run_one_pybinreg(cmd, outpath, outfile)
        else:
            run_many_pybinreg(jobs, options.num_procs)

    if signatures:
        print "Extracting the reports from each signature."
        report_files = extract_reports(names, paths, file_layout)
        
        print "Combining probabilities from each of the signatures."
        summarize_probabilities(signatures, names, paths, file_layout)

        print "Making heatmap of the results."
        sys.stdout.flush()
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            options.libpath, file_layout)

        print "Summarizing signatures."
        summarize_signatures(signatures, file_layout)

        print "Making a report."
        analysis_name = make_analysis_name(options)
        summarize_report(
            analysis_name, signatures, orig_signatures, report_files,
            start_time, why_dropped, file_layout)

    if options.archive:
        print "Compressing results."
        sys.stdout.flush()
        archive.zip_path(file_layout.ATTIC)
        for i, sig in enumerate(signatures):
            name, outpath = names[i], paths[i]
            archive.zip_path(outpath)
    
    print "Done."
Example no. 14
def main():
    from optparse import OptionParser, OptionGroup

    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--bfrm_bin",
                      dest="bfrm_bin",
                      default=None,
                      help="Specify the path to the BFRM binary.")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Filtering")
    group.add_option(
        "--filter_mean",
        dest="filter_mean",
        type=float,
        default=None,
        help="Remove this portion of genes based on mean expression.")
    group.add_option("--filter_var",
                     dest="filter_var",
                     type=float,
                     default=None,
                     help="Remove this portion of genes based on variance.")
    group.add_option("--cutoff",
                     dest="cutoff",
                     type=float,
                     default=0.99,
                     help="Cutoff probability for a gene to be in a factor.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "BFRM Parameters")
    group.add_option("--nc",
                     dest="num_control_vars",
                     type="int",
                     default=None,
                     help="Specify the number of control variables to use.")
    group.add_option(
        "--num_factors",
        dest="num_factors",
        type="int",
        default=None,
        help="The number of factors to fit.  "
        "For evolutionary search, starts with this number of factors.")
    group.add_option(
        "--design_file",
        dest="design_file",
        default=None,
        help="A file containing a matrix with additional design variables.")
    group.add_option(
        "--nucleus_file",
        dest="nucleus_file",
        default=None,
        help="A file that contains the genes to start the evolution.  "
        "This should be a text file that contains a whitespace-separated "
        "list of genes.  If this or --nucleus_geneset is given, "
        "the evolutionary search will be turned on.")
    group.add_option(
        "--nucleus_geneset",
        dest="nucleus_geneset",
        default=None,
        help="A gene set that contains the genes to start the evolution.  "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    group.add_option("--evol_max_factors",
                     dest="evol_max_factors",
                     default=None,
                     help="Maximum number of factors for the evolution.")
    group.add_option("--evol_max_genes",
                     dest="evol_max_genes",
                     default=None,
                     help="Maximum number of genes for the evolution.")
    parser.add_option_group(group)

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.cutoff <= 0 or options.cutoff > 1:
        parser.error("Cutoff probability should be between 0 and 1.")
    if options.filter_mean and (options.filter_mean < 0
                                or options.filter_mean >= 1):
        parser.error("filter_mean filter should be between 0 and 1.")
    if options.filter_var and (options.filter_var < 0
                               or options.filter_var >= 1):
        parser.error("filter_var filter should be between 0 and 1.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import this after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 1:
        parser.error("Please specify a file to factor.")
    filename, = args
    assert os.path.exists(filename), "File not found: %s" % filename

    if options.nucleus_file and options.nucleus_geneset:
        parser.error(
            "Please specify only one of --nucleus_file or --nucleus_geneset.")
    nucleus = None
    if options.nucleus_file:
        nucleus = _read_nucleus_file(options.nucleus_file)
    elif options.nucleus_geneset:
        nucleus = _read_nucleus_geneset(options.nucleus_geneset)
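    # _read_nucleus_file and _read_nucleus_geneset are defined elsewhere
    # in this script.  Based on the --nucleus_file help text, a minimal
    # sketch of the file reader might be (an assumption, not the real code):
    #
    #     def _read_nucleus_file(filename):
    #         # Whitespace-separated list of gene names.
    #         return open(filename).read().split()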

    # Not sure if this is necessary.  Don't know if BFRM will provide
    # a default if not given.
    if nucleus:
        assert options.num_factors, "Please specify number of factors."

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    x = arrayio.read(filename)
    MATRIX_orig = arrayio.convert(x, to_format=arrayio.gct_format)
    print "Read data set with %d genes and %d samples." % (MATRIX_orig.nrow(),
                                                           MATRIX_orig.ncol())

    # Make a copy so that in-place changes (like log_matrix) won't
    # affect the original matrix.
    MATRIX = MATRIX_orig.matrix()

    # Log the data set if necessary.
    log_matrix(MATRIX)
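    # log_matrix is defined elsewhere; presumably it log2-transforms the
    # values in place and skips matrices that already look logged (an
    # assumption based on the name and the comment above).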

    # Filter out genes based on mean and variance.
    MATRIX = filter_dataset(MATRIX, options.filter_mean, options.filter_var)
    if MATRIX.nrow() != MATRIX_orig.nrow():
        print "Filtered from %d genes to %d." % (MATRIX_orig.nrow(),
                                                 MATRIX.nrow())
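    # NOTE: filter_dataset is also defined elsewhere.  Based on the option
    # help text, it presumably drops the given fraction of genes, e.g.
    # filter_mean=0.25 removes the 25% of genes with the lowest mean
    # expression (an assumption, not verified here).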

    # Write out the data sets.
    write_dataset(file_layout.DATASET_ORIG, MATRIX_orig)
    write_dataset(file_layout.DATASET, MATRIX)

    # Run BFRM.
    DEBUG = False
    if not DEBUG:
        run_bfrm(file_layout, options.bfrm_bin, options.num_control_vars,
                 options.num_factors, options.design_file, nucleus,
                 options.evol_max_factors, options.evol_max_genes)

    # Generate output files.
    summarize_factor_scores(file_layout, options.cutoff, options.python,
                            options.arrayplot, options.cluster,
                            options.libpath)
    summarize_gene_factor_probs(file_layout, options.cutoff, options.python,
                                options.arrayplot, options.cluster,
                                options.libpath)
    summarize_factor_geneset(file_layout, options.cutoff)

    # BFRM model file should always be archived.
    archive.zip_path(file_layout.BFRM, noclobber=False)

    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print "Done."
Example n. 15
def main():
    # os, shutil, and sys are used below; import here so the snippet is
    # self-contained.
    import os
    import shutil
    import sys
    from optparse import OptionParser, OptionGroup

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--selap",
                      dest="selap_path",
                      default=None,
                      help="Specify the path to SELAPv3.")
    parser.add_option("",
                      "--matlab",
                      dest="matlab",
                      default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    # This doesn't give as much control over exactly which python
    # version is run.
    #parser.add_option(
    #    "", "--binpath", dest="binpath", action="append", default=[],
    #    help="Add to the binary search path.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher (less negative) penalties yield more subgroups.
    # Valid values range from 0 downward.
    group.add_option(
        "-p",
        "--penalty",
        dest="penalty",
        default="-33",
        help="Penalty for tuning number of subgroups (default -33).")
    group.add_option(
        "-m",
        "--model",
        dest="model_file",
        default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)

    if "." in options.penalty:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))
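    # parse_ranges is assumed to yield inclusive (start, end) integer
    # pairs; see the small demo after this example.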
    assert len(penalties) <= MAX_MODELS, "Too many penalties (max is %d)." % \
           MAX_MODELS
    assert penalties, "At least one penalty must be specified."
    assert not (options.model_file and len(penalties) != 1), \
           "Only one penalty is allowed when a model file is given."
    for p in penalties:
        assert p <= 0, "Penalties should be 0 or negative."

    num_analyses = len(penalties)

    # Set up the files.
    file_layout = make_file_layout(options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)
    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files.
        file_layout = make_file_layout(options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(options.selap_path, penalty, file_layout,
                       options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(options.python, options.arrayplot, options.cluster,
                          file_layout, options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print "Archiving results."
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue
        # Now do some cleanup if multiple analyses were requested.

        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
        ]
        for src, dst in files_to_copy:
            assert os.path.exists(src), "Missing file: %s" % src
            # shutil.copy2 preserves timestamps/permissions like cp -p,
            # without shelling out.
            shutil.copy2(src, dst)

        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print "Done."
Example n. 16
def test_format_conversion():
    # NOTE: test() and print_fn() are test-harness helpers defined
    # elsewhere in this module; a hypothetical sketch follows this
    # function.
    import StringIO
    import arrayio
    
    file_jeff = "samples/0159_cl.small.rma"
    file_pcl = "samples/0159_cl.small.pcl"
    file_gct = "samples/0159_cl.small.gct"

    # Test choose_format.
    fmt = arrayio.choose_format(file_jeff)
    test(print_fn, (fmt.__name__,), {}, "arrayio.jeffs_format")
    fmt = arrayio.choose_format(file_pcl)
    test(print_fn, (fmt.__name__,), {}, "arrayio.pcl_format")
    fmt = arrayio.choose_format(file_gct)
    test(print_fn, (fmt.__name__,), {}, "arrayio.gct_format")

    # Test guess_format.
    X_jeff = arrayio.read(file_jeff, datatype=None)
    X_pcl = arrayio.read(file_pcl, datatype=None)
    X_gct = arrayio.read(file_gct, datatype=None)
    fmt = arrayio.guess_format(X_jeff)
    test(print_fn, (fmt.__name__,), {}, "arrayio.jeffs_format")
    fmt = arrayio.guess_format(X_pcl)
    test(print_fn, (fmt.__name__,), {}, "arrayio.pcl_format")
    fmt = arrayio.guess_format(X_gct)
    test(print_fn, (fmt.__name__,), {}, "arrayio.gct_format")
    
    # Re-read the matrix with datatype=None; automatic format or float
    # conversion might otherwise mess up the comparisons below.
    X_jeff = arrayio.read(file_jeff, datatype=None)

    # Test convert.
    # _jeff_to_pcl
    handle = StringIO.StringIO()
    X_pcl = arrayio.convert(X_jeff, to_format=arrayio.pcl_format)
    arrayio.pcl_format.write(X_pcl, handle)
    #test(handle.getvalue, (), {}, open(file_pcl).read())
    
    #_jeff_to_gct
    handle = StringIO.StringIO()
    X_gct = arrayio.convert(X_jeff, to_format=arrayio.gct_format)
    arrayio.gct_format.write(X_gct, handle)
    #print handle.getvalue(),
    #test(handle.getvalue, (), {}, open(file_gct).read())

    #_gct_to_pcl
    handle = StringIO.StringIO()
    X_pcl = arrayio.convert(X_gct, to_format=arrayio.pcl_format)
    arrayio.pcl_format.write(X_pcl, handle)
    # _gct_to_pcl changes the gene id to "GeneID".  Fix this so that
    # we can compare against the gold standard file.  Everything else
    # should be the same.
    x = handle.getvalue()
    x = x.replace("GeneID", "Probe.Set.ID")
    test(print_fn, (x,), {}, open(file_pcl).read())
    
    #_pcl_to_gct
    handle = StringIO.StringIO()
    X_pcl = arrayio.read(file_pcl, datatype=None)
    X_gct = arrayio.convert(X_pcl, to_format=arrayio.gct_format)
    arrayio.gct_format.write(X_gct, handle)
    test(handle.getvalue, (), {}, open(file_gct).read())
    
    #_tdf_to_gct
    handle = StringIO.StringIO()
    X_pcl = arrayio.read(file_pcl, datatype=None)
    X_gct = arrayio.convert(
        X_pcl, from_format=arrayio.tab_delimited_format,
        to_format=arrayio.gct_format)
    arrayio.gct_format.write(X_gct, handle)
    test(handle.getvalue, (), {}, open(file_gct).read())
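test() and print_fn() are not defined anywhere in this excerpt.  A minimal sketch of a harness consistent with how they are called above, assuming test() captures what the function prints (or returns) and compares it against the expected string (a hypothetical reconstruction, not the project's actual helpers):

import sys
import StringIO

def print_fn(*args):
    # Print each argument; the harness captures this output.
    for x in args:
        print x,

def test(fn, args, keywds, expected):
    # Call fn, capture anything it prints, and compare the printed
    # output (or the return value) against the expected string.
    saved_stdout = sys.stdout
    sys.stdout = StringIO.StringIO()
    try:
        retval = fn(*args, **keywds)
        printed = sys.stdout.getvalue()
    finally:
        sys.stdout = saved_stdout
    got = printed.strip() or str(retval).strip()
    if got == expected.strip():
        print "PASSED %s" % fn.__name__
    else:
        print "FAILED %s" % fn.__name__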