def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Log-transform the input signal file."""
    import math
    import arrayio
    from genomicode import filelib
    from genomicode import binreg

    signal_file = in_data.identifier
    filelib.assert_exists_nz(signal_file)
    M = arrayio.read(signal_file)
    assert not binreg.is_logged_array_data(M), 'the file is already logged'

    # Change the matrix in place.
    X = M._X
    for i in range(len(X)):
        for j in range(len(X[i])):
            x = X[i][j]
            if x is None:
                continue
            x = float(x)
            # Floor at 1 so the log is never negative.
            if x < 1:
                x = 1
            x = math.log(x, 2)
            X[i][j] = x

    M_c = arrayio.convert(M, to_format=arrayio.tab_delimited_format)
    handle = open(outfile, 'w')
    arrayio.tab_delimited_format.write(M_c, handle)
    handle.close()
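# The transform above (floor at 1, then log base 2) is easy to sanity-check
# in isolation.  A minimal sketch on a plain list-of-lists, independent of
# arrayio; the name log2_floor is illustrative, not part of this codebase.
def log2_floor(X):
    import math
    out = []
    for row in X:
        new_row = []
        for x in row:
            if x is None:          # missing values pass through unchanged
                new_row.append(None)
                continue
            x = float(x)
            if x < 1:              # floor so the log is never negative
                x = 1.0
            new_row.append(math.log(x, 2))
        out.append(new_row)
    return out

# log2_floor([[0.5, 8], [None, 2]]) -> [[0.0, 3.0], [None, 1.0]]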
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Convert an input signal file to tab-delimited format."""
    import arrayio

    in_filename = in_data.identifier
    to_format = arrayio.tdf
    MATRIX = arrayio.read(in_filename)
    MATRIX_c = arrayio.convert(MATRIX, to_format=to_format)
    to_format.write(MATRIX_c, open(outfile, 'w'))
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import arrayio
    from genomicode import quantnorm
    from genomicode import filelib

    in_data = antecedents
    M = arrayio.read(in_data.identifier)
    Y = quantnorm.normalize(M)
    f = open(outfile, 'w')
    Y_c = arrayio.convert(Y, to_format=arrayio.pcl_format)
    arrayio.tab_delimited_format.write(Y_c, f)
    f.close()
    assert filelib.exists_nz(outfile), (
        'quantile normalization failed to produce output file %s' % outfile)
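# quantnorm.normalize is used above as a black box.  For reference, a
# minimal sketch of standard quantile normalization on a list of columns
# (samples); illustrative only, and it ignores ties, which the real
# genomicode implementation may handle differently.
def quantile_normalize(columns):
    # columns: list of equal-length lists, one per sample.
    n = len(columns[0])
    # Mean of the i-th smallest value across all columns.
    sorted_cols = [sorted(c) for c in columns]
    rank_means = [
        sum(col[i] for col in sorted_cols) / float(len(columns))
        for i in range(n)]
    normalized = []
    for c in columns:
        # Rank each value within its own column (ties broken by position),
        # then replace it with the mean for that rank.
        order = sorted(range(n), key=lambda i: c[i])
        new_c = [None] * n
        for rank, i in enumerate(order):
            new_c[i] = rank_means[rank]
        normalized.append(new_c)
    return normalized

# quantile_normalize([[5, 2, 3], [4, 1, 6]])
# -> [[5.5, 1.5, 3.5], [3.5, 1.5, 5.5]]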
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import os
    import subprocess
    import arrayio
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    in_data = antecedents
    bfrm_path = config.bfrmnorm
    bfrm_BIN = module_utils.which(bfrm_path)
    assert bfrm_BIN, 'cannot find %s' % bfrm_path

    num_factor = 1
    if 'num_factors' in user_options:
        num_factor = int(user_options['num_factors'])
        assert num_factor >= 1, 'num_factors should be >= 1'

    M = arrayio.read(in_data.identifier)
    col_num = M.ncol()
    assert num_factor <= col_num, (
        'num_factors should be no more than the number of samples (%d)' %
        col_num)

    tmp = 'tmp_dir'
    command = [
        'python', bfrm_BIN, in_data.identifier, '-f', str(num_factor),
        '-o', tmp]
    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    assert filelib.exists_nz(tmp), (
        'the output directory %s for bfrm_normalize is missing or empty' %
        tmp)
    out = os.path.join(tmp, 'normalized.gct')
    assert filelib.exists_nz(out), (
        'the output gct file for bfrm_normalize is missing or empty')

    M = arrayio.read(out)
    M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
    f = open(outfile, 'w')
    arrayio.tab_delimited_format.write(M_new, f)
    f.close()
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    import arrayio
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    metadata = {}

    norm_para = ["variance", "sum_of_squares"]
    assert "gene_normalize" in out_attributes
    normalize = out_attributes["gene_normalize"]
    assert normalize in norm_para, \
        "Invalid normalize option: %s" % normalize

    if normalize == "variance":
        f = open(outfile, 'w')
        M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
        M_n = jmath.safe_norm_mv(M.slice())
        M._X = M_n
        M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
        arrayio.pcl_format.write(M_c, f)
        f.close()
    elif normalize == "sum_of_squares":
        cluster = mlib.get_config("cluster", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            sq(cluster),
            "-f", sq(in_data.identifier),
            "-ng",
            "-u", outfile,
        ]
        parallel.sshell(cmd)
        metadata["command"] = cmd
        # The cluster program writes its output to <outfile>.nrm.
        outputfile = outfile + '.nrm'
        filelib.assert_exists_nz(outputfile)
        os.rename(outputfile, outfile)

    filelib.assert_exists_nz(outfile)
    return metadata
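# jmath.safe_norm_mv is used above as a black box.  Presumably it rescales
# each gene (row) to mean 0 and variance 1; a rough sketch of that
# operation, assuming the "safe" part means constant rows are centered
# rather than divided by a zero standard deviation.
def norm_mean_var(X):
    import math
    out = []
    for row in X:
        n = float(len(row))
        mean = sum(row) / n
        var = sum((x - mean) ** 2 for x in row) / n   # population variance
        if var == 0:
            # Constant row: center only, to avoid dividing by zero.
            out.append([x - mean for x in row])
            continue
        sd = math.sqrt(var)
        out.append([(x - mean) / sd for x in row])
    return out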
def run_cluster30(filename, algorithm, user_options, **more_args):
    import arrayio
    from genomicode import cluster30
    from Betsy import module_utils as mlib

    MATRIX_FILE = "data.pcl"

    DISTANCE_MEASURES = cluster30.DIST2ID.keys()
    YESNO = ["yes", "no"]

    cluster_genes = mlib.get_user_option(
        user_options, "cluster_genes", not_empty=True, allowed_values=YESNO)
    cluster_arrays = mlib.get_user_option(
        user_options, "cluster_arrays", not_empty=True, allowed_values=YESNO)
    distance_metric = mlib.get_user_option(
        user_options, "distance_measure", not_empty=True,
        allowed_values=DISTANCE_MEASURES)

    # Make a PCL-formatted file for Cluster 3.0.  It might misinterpret
    # the columns of a plain tab-delimited file.
    matrix = arrayio.read(filename)
    matrix = arrayio.convert(matrix, to_format=arrayio.pcl_format)
    arrayio.write(matrix, open(MATRIX_FILE, 'w'))

    jobname = "cluster"
    cmd = cluster30.cluster30_file(
        MATRIX_FILE,
        (cluster_genes == "yes"),
        (cluster_arrays == "yes"),
        algorithm,
        distance=distance_metric,
        jobname=jobname,
        **more_args)

    # Find the output files and name them appropriately.
    cluster_files = cluster30._find_cluster_files(jobname)
    fix_cluster30_dup_header(cluster_files["cdt"])
    return cmd, cluster_files
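# An illustrative call, assuming a working Cluster 3.0 install and the
# Betsy/genomicode stack.  The algorithm name and distance value below are
# assumptions; the real allowed values come from cluster30.DIST2ID and the
# callers of run_cluster30.
#
# cmd, cluster_files = run_cluster30(
#     "signal.txt", "hierarchical",
#     {"cluster_genes": "yes",
#      "cluster_arrays": "no",
#      "distance_measure": "correlation"})
# # cluster_files["cdt"] etc. name the Cluster 3.0 output files.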
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Convert a signal file to PCL format."""
    import shutil
    import arrayio
    from genomicode import filelib

    in_data = antecedents
    # choose_format returns a format module, e.g. arrayio.pcl_format.
    fmt = arrayio.choose_format(in_data.identifier)
    if fmt is arrayio.pcl_format:
        # Already in PCL format; just copy it over.
        shutil.copyfile(in_data.identifier, outfile)
    else:
        M = arrayio.read(in_data.identifier)
        M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
        f = open(outfile, 'w')
        arrayio.pcl_format.write(M_c, f)
        f.close()
    assert filelib.exists_nz(outfile), (
        'the output file %s for convert_signal_to_pcl is missing or empty' %
        outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import arrayio
    from genomicode import arrayplatformlib as apl

    MATRIX = arrayio.read(antecedents.identifier)

    # The converter will just use the first two columns for NAME and
    # DESCRIPTION.  Try to find better ones.
    # Column 1: PROBE_ID, GENE_ID, or the original column 1.
    # Column 2: GENE_SYMBOL, or the original column 2.
    num_headers = len(MATRIX.row_names())
    cat2header = apl.categorize_headers(MATRIX, remove_version=True)

    h1 = cat2header.get(apl.PROBE_ID)
    if not h1:
        h1 = cat2header.get(apl.GENE_ID)
    if not h1 and num_headers:
        h1 = MATRIX.row_names()[0]
    h2 = cat2header.get(apl.GENE_SYMBOL)
    if not h2 and num_headers >= 2:
        h2 = MATRIX.row_names()[1]

    # Move h2, then h1, to the front, so h1 ends up in column 1.
    if h2:
        i = MATRIX._row_order.index(h2)
        MATRIX._row_order.pop(i)
        MATRIX._row_order.insert(0, h2)
    if h1:
        i = MATRIX._row_order.index(h1)
        MATRIX._row_order.pop(i)
        MATRIX._row_order.insert(0, h1)

    MATRIX_c = arrayio.convert(MATRIX, to_format=arrayio.gct_format)
    arrayio.gct_format.write(MATRIX_c, open(outfile, 'w'))
def main():
    from optparse import OptionParser

    usage = "usage: %prog [options] <file1> <file2> ..."
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-f", "--num_factors", dest="num_factors", type="int", default=15,
        help="Number of factors to use for normalization.")
    # Any string in the control probe file can be a control probe.
    # Delimited by tabs and newlines.
    parser.add_option(
        "", "--control_probe_file", dest="control_probe_file", default=None,
        help="File that contains the control probes.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm", dest="bfrm_path", default=None,
        help="Specify the path to the BFRM_normalize directory.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--povray", dest="povray", default="povray",
        help="Specify the command to run povray.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import these after the library path is set.
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()

    genepattern.fix_environ_path()

    if not args:
        parser.error("Please specify files to normalize.")
    filenames = args
    names = [os.path.split(x)[-1] for x in filenames]
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Check to make sure the value for num_factors is reasonable.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors < MIN_FACTORS:
        if MIN_FACTORS == 1:
            parser.error("At least %d factor is required." % MIN_FACTORS)
        else:
            parser.error("At least %d factors are required." % MIN_FACTORS)
    elif options.num_factors > MAX_FACTORS:
        parser.error("%d factors is too many.  Maximum is %d." %
                     (options.num_factors, MAX_FACTORS))

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read each of the input files and align them.
    matrices = read_matrices(filenames)

    # Make sure the number of factors doesn't exceed the size of the
    # matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Standardize each of the matrices to GCT format.
    if 1:  # for debugging
        for i in range(len(matrices)):
            matrices[i] = arrayio.convert(
                matrices[i], to_format=arrayio.gct_format)
        write_dataset(file_layout.DS_ORIG, matrices)

    # Log each of the matrices if needed.
    if 1:  # for debugging
        log_matrices(names, matrices)
        write_dataset(file_layout.DS_PROC, matrices)
        sys.stdout.flush()

    # Format the parameters and output files for bfrm.
    if 1:  # for debugging
        run_bfrm(options.bfrm_path, options.num_factors,
                 options.control_probe_file, file_layout, options.matlab)

    # Generate some files for output.
    if 1:  # for debugging
        summarize_dataset(file_layout)
        summarize_filtered_genes(file_layout)
        summarize_heatmaps(options.python, options.arrayplot,
                           options.cluster, file_layout, options.libpath)
        summarize_pca(options.povray, file_layout, matrices)
        summarize_report(filenames, matrices, options.num_factors,
                         start_time, file_layout)

    # Archive the BFRM stuff, and the big files.
    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.BFRM, noclobber=False)
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print "Done."
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import os
    import subprocess
    import arrayio
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    data_node, cls_node = antecedents

    # Write the expression data in GCT format for GenePattern.
    tmp = os.path.join(".", 'tmp.txt')
    f = open(tmp, 'w')
    M = arrayio.read(data_node.identifier)
    M_c = arrayio.convert(M, to_format=arrayio.gct_format)
    arrayio.gct_format.write(M_c, f)
    f.close()

    module_name = 'ClassNeighbors'
    gp_parameters = dict()
    gp_parameters['data.filename'] = tmp
    gp_parameters['class.filename'] = cls_node.identifier

    if 'cn_num_neighbors' in user_options:
        gp_parameters['num.neighbors'] = str(
            user_options['cn_num_neighbors'])
    if 'cn_num_perm' in user_options:
        if user_options['cn_num_perm'].isdigit():
            gp_parameters['num.permutations'] = str(
                user_options['cn_num_perm'])
    if 'cn_user_pval' in user_options:
        if module_utils.is_number(user_options['cn_user_pval']):
            gp_parameters['user.pval'] = str(user_options['cn_user_pval'])

    mean_median = {'mean': '', 'median': '-d'}
    if out_attributes['cn_mean_or_median'] in ['mean', 'median']:
        gp_parameters['mean.or.median'] = mean_median[
            out_attributes['cn_mean_or_median']]

    p = {'t_test': '', 'snr': '-S'}
    if out_attributes['cn_ttest_or_snr'] in p:
        gp_parameters['ttest.or.snr'] = p[out_attributes['cn_ttest_or_snr']]

    if out_attributes['cn_filter_data'] in ['yes', 'no']:
        gp_parameters['filter.data'] = str(out_attributes['cn_filter_data'])
    if 'cn_abs_diff' in user_options:
        if module_utils.is_number(user_options['cn_abs_diff']):
            gp_parameters['min.abs.diff'] = str(user_options['cn_abs_diff'])
    if 'cn_min_threshold' in user_options:
        if module_utils.is_number(user_options['cn_min_threshold']):
            gp_parameters['min.threshold'] = str(
                user_options['cn_min_threshold'])
    if 'cn_max_threshold' in user_options:
        if module_utils.is_number(user_options['cn_max_threshold']):
            gp_parameters['max.threshold'] = str(
                user_options['cn_max_threshold'])
    if 'cn_min_folddiff' in user_options:
        if module_utils.is_number(user_options['cn_min_folddiff']):
            gp_parameters['min.fold.diff'] = str(
                user_options['cn_min_folddiff'])

    gp_path = config.genepattern
    gp_module = module_utils.which(gp_path)
    assert gp_module, 'cannot find %s' % gp_path

    download_directory = os.path.join(".", 'class_neighbors_result')
    command = [gp_module, module_name, '-o', download_directory]
    for key in gp_parameters:
        command.extend(['--parameters', key + ':' + gp_parameters[key]])

    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    assert os.path.exists(download_directory), (
        'there is no output directory for class_neighbors')
    result_files = os.listdir(download_directory)
    assert 'stderr.txt' not in result_files, 'GenePattern reported an error'
    os.remove(tmp)

    # Pull the gene names out of the .odf result file.
    gene_list = []
    for result_file in result_files:
        if not result_file.endswith('.odf'):
            continue
        f = open(os.path.join(download_directory, result_file), 'r')
        text = f.read().split('\n')
        f.close()
        numline = 8
        startline = 14
        assert text[numline].startswith('NumNeighbors'), \
            'unexpected odf file format'
        number_gene = int(text[numline].split('=')[1])
        assert text[startline].startswith('1'), 'unexpected start line'
        for line in text[startline:startline + number_gene]:
            cols = line.split('\t')
            gene_list.append(cols[10])

    f = open(outfile, 'w')
    f.write('\t'.join(gene_list))
    f.close()
    assert filelib.exists_nz(outfile), (
        'the output file %s for rank_genes_by_class_neighbors is missing '
        'or empty' % outfile)
def main():
    import os
    import argparse
    import subprocess
    import StringIO
    import zipfile
    import shutil
    import arrayio
    from genomicode import config
    from genomicode import arraysetlib

    parser = argparse.ArgumentParser(description="Do a GSEA analysis.")
    parser.add_argument("expression_file", help="Gene expression file.")
    parser.add_argument("outpath", help="Where to save the files.")
    parser.add_argument(
        "--clobber", default=False, action="store_true",
        help="Overwrite outpath, if it already exists.")
    parser.add_argument(
        "--dry_run", default=False, action="store_true",
        help="Set up the files, but do not run GSEA.")

    group = parser.add_argument_group(title="Class Labels")
    group.add_argument("--cls_file", default=None, help="Class label file.")
    group.add_argument(
        "--indexes1", default=None,
        help="Which columns are in group 1, e.g. 1-5,8 (1-based, "
        "inclusive).")
    group.add_argument(
        "--indexes2", default=None,
        help="(OPTIONAL) Which columns are in group 2.  If not given, then "
        "will use any sample not included in --indexes1.")
    group.add_argument(
        "--indexes_include_headers", "--iih", action="store_true",
        help="If not given (default), then column 1 is the first column "
        "with data.  If given, then column 1 is the very first column in "
        "the file, including the headers.")
    group.add_argument("--name1", default=None, help="Name for group 1.")
    group.add_argument("--name2", default=None, help="Name for group 2.")

    group = parser.add_argument_group(title="Other Parameters")
    group.add_argument(
        "--platform", default=None,
        help="The platform (GenePattern chip) of the expression data, "
        "e.g. HG_U133A_2.chip.  You should leave this blank if the IDs "
        "in the gene expression data set are gene symbols.  Allowed "
        "values can be found on the GenePattern/GSEA website.")
    group.add_argument(
        "--min_match_score", default=0.95, type=float,
        help="When trying to identify the rows of a matrix or geneset, "
        "require at least this portion of the IDs to be recognized.")
    x = sorted(DATABASE2GENESET)
    x = [x.replace(DEFAULT_DATABASE, "%s (DEFAULT)" % x) for x in x]
    x = ", ".join(x)
    x = "Which database to search.  Possible values are: %s." % x
    group.add_argument("--database", default=DEFAULT_DATABASE, help=x)
    group.add_argument(
        "--database_file", default=None,
        help="Search a GMT or GMX file instead of the default databases.")
    group.add_argument(
        "--no_collapse_dataset", default=False, action="store_true",
        help="Do not 1) convert gene IDs to gene symbols, and do not 2) "
        "collapse duplicate gene symbols.  Set this if the gene IDs are "
        "already unique gene symbols.  Also, can use this if you "
        "provide the database_file and the gene IDs match the ones "
        "in your gene expression file.")
    # phenotype permutation is more accurate, but with only 2 samples we
    # need gene_set.  (Not sure about 3 samples?  Where is the limit?)
    group.add_argument(
        "--permutation_type", default="phenotype",
        choices=["phenotype", "gene_set"],
        help="Default is phenotype.  With <= 6 samples, recommend using "
        "gene_set instead.")

    args = parser.parse_args()
    assert os.path.exists(args.expression_file), \
        "File not found: %s" % args.expression_file
    assert type(args.min_match_score) is type(0.0)
    assert args.min_match_score > 0.2, "min_match_score too low"
    assert args.min_match_score <= 1.0, "min_match_score too high"

    # Must have either the indexes or the cls_file, but not both.
    assert args.cls_file or args.indexes1, (
        "Must provide either a CLS file or the indexes for one group.")
    assert not (args.cls_file and args.indexes1), (
        "Cannot provide both a CLS file and the indexes.")
    assert not (args.cls_file and args.indexes2), (
        "Cannot provide both a CLS file and the indexes.")
    assert not (args.indexes2 and not args.indexes1)
    if args.cls_file:
        assert os.path.exists(args.cls_file), \
            "File not found: %s" % args.cls_file
        assert not args.name1
        assert not args.name2
    assert args.outpath, "Please specify an outpath."
    assert not os.path.exists(args.outpath) or args.clobber, \
        "Outpath %s already exists." % args.outpath
    if os.path.exists(args.outpath):
        shutil.rmtree(args.outpath)
    os.mkdir(args.outpath)

    MATRIX = arrayio.read(args.expression_file)

    # Make a CLS file, if necessary.
    if args.cls_file:
        names, classes = arraysetlib.read_cls_file(args.cls_file)
        assert len(names) == 2, "I must have 2 classes."
        name1, name2 = names
    else:
        x = arraysetlib.resolve_classes(
            MATRIX, args.indexes1, args.indexes2,
            args.indexes_include_headers, args.name1, args.name2)
        name1, name2, classes = x

    x = fix_class_order(MATRIX, name1, name2, classes)
    MATRIX, name1, name2, classes = x

    # Remove samples that aren't in any class.
    for c in classes:
        assert c in [0, 1, None]
    I = [i for (i, x) in enumerate(classes) if x is not None]
    classes = [classes[i] for i in I]
    MATRIX = MATRIX.matrix(None, I)

    handle = StringIO.StringIO()
    arraysetlib.write_cls_file(handle, name1, name2, classes)
    cls_data = handle.getvalue()

    # Convert the format after making the CLS file, or else args.indexes1
    # with args.indexes_include_headers might be off.
    # BUG: What if the conversion to GCT discards the proper platform
    # of this matrix?
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    database_file = None
    if args.database_file:
        database_file = os.path.realpath(args.database_file)
        assert os.path.exists(database_file), (
            "I could not find file: %s" % database_file)
    # Required, even if gene.sets.database.file is given.
    gene_set_database = format_gene_set_database(args.database)

    # If no database file is given, then we need to know the platform
    # for the expression data.  (If one is given, we let the user make
    # sure the platforms match.)
    platform = args.platform
    if platform is None and not database_file and \
           not args.no_collapse_dataset:
        platform = guess_chip_platform(MATRIX, args.min_match_score)
        # If gene symbols are already provided, turn off collapse_dataset.
        if platform is None:  # Gene Symbol
            args.no_collapse_dataset = True

    # Do some sanity checking to make sure the inputs are reasonable.
    MATRIX = check_matrix(MATRIX)
    check_classes(classes, args.permutation_type)

    # Set up the file names.
    opj = os.path.join
    x = os.path.split(args.expression_file)[1]
    if x.lower().endswith(".gz"):
        x = x[:-3]
    x = os.path.splitext(x)[0]
    x = x.replace(" ", "_")  # GenePattern cannot work with spaces.
    assert x, "empty file name"
    gct_file = "%s.gct" % x
    cls_file = "%s.cls" % x
    out_file = "%s.zip" % x  # GenePattern saves output to <file>.zip.
    gct_full = opj(args.outpath, gct_file)
    cls_full = opj(args.outpath, cls_file)
    out_full = opj(args.outpath, out_file)
    if database_file:
        db_file = os.path.split(database_file)[1]
        db_full = opj(args.outpath, db_file)

    # Write the gene expression, class label, and database files.  It
    # is better to have local copies of the files.  It is unclear how
    # to upload files to GenePattern if the file names have spaces in
    # them.  Get around this by making all the files local.
    arrayio.gct_format.write(MATRIX, open(gct_full, 'w'))
    open(cls_full, 'w').write(cls_data)
    if database_file:
        open(db_full, 'w').write(open(database_file).read())

    collapse_dataset = "true"
    if args.no_collapse_dataset:
        collapse_dataset = "false"

    # Set up the analysis.
    params = {
        "expression.dataset": gct_file,
        "phenotype.labels": cls_file,
        "collapse.dataset": collapse_dataset,
        "permutation.type": args.permutation_type,
    }
    # The platform is required, even if collapse.dataset is false.  If
    # no platform is given, then specify a default one.
    #CHIP_PLATFORM = "chip.platform"
    CHIP_PLATFORM = "chip.platform.file"
    params[CHIP_PLATFORM] = platform
    if params[CHIP_PLATFORM] is None:
        params[CHIP_PLATFORM] = "HG_U133A.chip"
    if database_file:
        params["gene.sets.database.file"] = db_file
    # Required, even if gene.sets.database.file is given.
    params["gene.sets.database"] = gene_set_database

    if args.dry_run:
        return

    cmd = [config.genepattern, "-o", ".", "GSEA"]
    for (key, value) in reversed(list(params.iteritems())):
        cmd.extend(["--parameters", "%s:%s" % (key, value)])

    # Run the analysis in the outpath.  GSEA leaves a file
    # "System.out" in the current directory.
    cwd = os.getcwd()
    try:
        os.chdir(args.outpath)
        p = subprocess.Popen(
            cmd, bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT, close_fds=True)
        w, r = p.stdin, p.stdout
        w.close()
        # Check for errors in the output.
        x = r.read()
        # Get rid of GenePattern garbage.
        data = x.replace("Loading required package: rJava", "")
        p.wait()
    finally:
        os.chdir(cwd)

    x = data.strip()
    # rpy2 generates UserWarnings for some reason.
    # .../rpy2/robjects/functions.py:106: UserWarning:
    #   res = super(Function, self).__call__(*new_args, **new_kwargs)
    if x.find("UserWarning") >= 0 and \
           x.endswith("(*new_args, **new_kwargs)"):
        # Ignore this UserWarning.
        x = ""
    # Also ignore this RRuntimeWarning.
    # .../rpy2/rinterface/__init__.py:185: RRuntimeWarning:
    #   warnings.warn(x, RRuntimeWarning)
    if x.find("RRuntimeWarning") >= 0 and \
           x.endswith("warnings.warn(x, RRuntimeWarning)"):
        x = ""
    assert not x, "%s\n%s" % (" ".join(map(str, cmd)), data)

    error_file = os.path.join(args.outpath, "stderr.txt")
    assert not os.path.exists(error_file), (
        "Error generated by GenePattern:\n%s" % open(error_file).read())

    # Unzip the zipped results in the outpath.
    assert os.path.exists(out_full), (
        "ERROR: Output file is missing [%s]." % out_full)
    zfile = zipfile.ZipFile(out_full)
    zfile.extractall(args.outpath)
    os.unlink(out_full)

    x = os.path.join(args.outpath, "index.html")
    assert os.path.exists(x), "I could not find the GSEA output: %s" % x
def main():
    from optparse import OptionParser

    usage = "usage: %prog [options] <bfrm_model> <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "", "--bfrm_path", dest="bfrm_path", default=None,
        help="Specify the path to BFRM_project.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import these after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 2:
        parser.error("Please specify files.")
    model_file, filename = args
    assert os.path.exists(model_file), "File not found: %s" % model_file
    assert os.path.exists(filename), "File not found: %s" % filename

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    x = arrayio.read(filename)
    MATRIX = arrayio.convert(x, to_format=arrayio.gct_format)
    print "Read data set with %d genes and %d samples." % (
        MATRIX.nrow(), MATRIX.ncol())
    log_matrix(MATRIX)

    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    # Save the BFRM model.
    write_model(model_file, file_layout)

    # Run BFRM projection.
    run_bfrm_project(file_layout, options.bfrm_path, options.matlab)

    # Generate the output files.
    summarize_factor_scores(
        file_layout, options.python, options.arrayplot, options.cluster,
        options.libpath)

    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        archive.zip_path(file_layout.BFRM, noclobber=False)

    print "Done."
def main():
    from optparse import OptionParser, OptionGroup

    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-r", "--rma", dest="rma_dataset", type="string", default=None,
        help="Specify the RMA-normalized data to analyze.")
    parser.add_option(
        "-m", "--mas5", dest="mas5_dataset", type="string", default=None,
        help="Specify the MAS5-normalized data to analyze.")
    parser.add_option(
        "-i", "--illu", dest="illu_dataset", type="string", default=None,
        help="Specify the Illumina data to analyze.")
    parser.add_option(
        "", "--sigdb_path", dest="sigdb_path", type="string", default=None,
        help="Location of the sigdb/ directory.")
    parser.add_option(
        "", "--sigtag", dest="signature_tags", default=[], action="append",
        help="Specify a specific tag to use.")
    parser.add_option(
        "", "--sigid", dest="signature_ids", default=[], action="append",
        help="Specify a specific signature to use.")
    parser.add_option(
        "", "--max_signatures", dest="max_signatures", type="int",
        default=None,
        help="Maximum number of signatures to run (for DEBUGGING).")
    parser.add_option(
        "-j", "", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")
    parser.add_option(
        "-z", "", dest="archive", action="store_true", default=False,
        help="Archive the individual signatures.  Helpful for GenePattern.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "", "--gp_imod_all_vars", dest="gp_imod_all_vars", type="string",
        default=None,
        help="Special internal variable for use with GenePattern "
        "interactive modules.")
    parser.add_option(
        "", "--debug_gp_imod_all_vars", action="store_true", default=False,
        dest="debug_gp_imod_all_vars")

    group = OptionGroup(parser, "Pybinreg")
    group.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python.")
    group.add_option(
        "", "--matlab", dest="matlab", default=None,
        help="Specify the command to run matlab.")
    group.add_option(
        "", "--povray", dest="povray", default=None,
        help="Specify the command to run povray.")
    group.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    group.add_option(
        "", "--binreg", dest="binreg_path", default=None,
        help="Specify the path to the BinReg2.0 code.")
    group.add_option(
        "", "--pybinreg", dest="pybinreg", default=None,
        help="Specify the command to run pybinreg.py.")
    group.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option_group(group)

    options, args = parser.parse_args()
    if args:
        parser.error("Too many arguments.")

    # DEBUG the gp_imod_all_vars variable.
    if options.debug_gp_imod_all_vars:
        assert not options.gp_imod_all_vars
        options.gp_imod_all_vars = (
            "mas5_expression_file_cb=file&mas5_expression_file_url=&"
            "rma_expression_file_cb=file&rma_expression_file_url=&"
            # Skip the AKT signature.
            "sig_AKT=no&"
            # Change BCAT normalization.
            "sig_BCAT=yes (custom parameters)&"
            "sig_BCAT_apply_quantile_normalization=no&"
            "sig_BCAT_apply_shiftscale_normalization=no&"
            "sig_BCAT_num_genes=85&sig_BCAT_num_metagenes=2&"
            # No changes in E2F1.
            "sig_E2F1=yes (custom parameters)&"
            "sig_E2F1_apply_quantile_normalization=yes&"
            "sig_E2F1_apply_shiftscale_normalization=yes&"
            "sig_E2F1_num_genes=150&sig_E2F1_num_metagenes=2&"
            # Change the genes in EGFR.
            "sig_EGFR=yes (custom parameters)&"
            "sig_EGFR_apply_quantile_normalization=no&"
            "sig_EGFR_apply_shiftscale_normalization=yes&"
            "sig_EGFR_num_genes=501&sig_EGFR_num_metagenes=2&"
            # Change quantile, genes, metagenes in ER.
            "sig_ER=yes (custom parameters)&"
            "sig_ER_apply_quantile_normalization=no&"
            "sig_ER_apply_shiftscale_normalization=yes&"
            "sig_ER_num_genes=150&sig_ER_num_metagenes=3&"
            "sig_HER2=yes (default parameters)&"
            "sig_IFNalpha=yes (default parameters)&"
            "sig_IFNgamma=yes (default parameters)&"
            "sig_MYC=yes (default parameters)&"
            "sig_P53=yes (default parameters)&"
            "sig_P63=yes (default parameters)&"
            "sig_PI3K=yes (default parameters)&"
            "sig_PR=yes (default parameters)&"
            "sig_RAS=yes (default parameters)&"
            "sig_SRC=yes (default parameters)&"
            "sig_STAT3=yes (default parameters)&"
            "sig_TGFB=yes (default parameters)&"
            "sig_TNFa=yes (default parameters)&"
            "which_signatures=I choose myself")

    datafile_rma = datafile_mas5 = datafile_illu = None
    if options.rma_dataset is not None:
        assert os.path.exists(options.rma_dataset), \
            "RMA file not found: %s" % options.rma_dataset
        datafile_rma = os.path.realpath(options.rma_dataset)
    if options.mas5_dataset is not None:
        assert os.path.exists(options.mas5_dataset), \
            "MAS5 file not found: %s" % options.mas5_dataset
        datafile_mas5 = os.path.realpath(options.mas5_dataset)
    if options.illu_dataset is not None:
        assert os.path.exists(options.illu_dataset), \
            "ILLU file not found: %s" % options.illu_dataset
        datafile_illu = os.path.realpath(options.illu_dataset)
    assert datafile_rma or datafile_mas5 or datafile_illu, \
        "Please specify at least one data set."

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import these after the library path is set.
    import time
    import arrayio
    from genomicode import config
    from genomicode import parallel
    from genomicode import archive
    from genomicode import hashlib
    from genomicode import matrixlib
    from genomicode import genepattern

    x = options.sigdb_path or config.sigdb_path
    sigdb_path = os.path.realpath(x)
    assert os.path.exists(sigdb_path), \
        "I could not find the signature database: %s." % sigdb_path

    start_time = time.time()

    genepattern.fix_environ_path()
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the signatures and select the ones to score.
    # BUG: Should allow this to be specified on the command line.
    desired_tags = ["Pathway"]  # default
    if options.signature_tags:
        desired_tags = options.signature_tags[:]

    all_normalization = ["RMA", "MAS5", "ILLU"]
    desired_normalization = []
    if datafile_rma is not None:    # RMA datafile is specified.
        desired_normalization.append("RMA")
    if datafile_mas5 is not None:   # MAS5 datafile is specified.
        desired_normalization.append("MAS5")
    if datafile_illu is not None:   # ILLU datafile is specified.
        desired_normalization.append("ILLU")

    # If any signature IDs are specified, then use only those IDs and
    # ignore the desired tags.
    print "Reading signature database: %s." % sigdb_path
    desired_ids = []
    if options.signature_ids:
        desired_ids = options.signature_ids[:]
    x = read_signatures(
        sigdb_path, all_normalization, desired_ids, desired_tags)
    signatures = x
    orig_signatures = signatures[:]

    # Filter for just the normalizations that we have data files for.
    # Keep track of why we filtered out certain signatures.
    why_dropped = {}  # ID -> explanation as string
    good = []
    for sig in signatures:
        if sig.Normalization.upper() in desired_normalization:
            good.append(sig)
            continue
        x = ("Signature requires %s normalized data, but it was not "
             "provided." % sig.Normalization.upper())
        why_dropped[sig.xID] = x
    signatures = good
    assert signatures, "No signatures available."

    # Process additional parameters from GenePattern.
    # o Do this before max_signatures, so that the maximum signatures
    #   is selected only out of the ones that the user specified.
    # o Do this before names and paths, so the variables will be
    #   aligned.
    # gp_imod_all_vars can be None or "".
    if options.gp_imod_all_vars:
        x = process_gp_imod_all_vars(
            options.gp_imod_all_vars, signatures, why_dropped)
        signatures, why_dropped = x

    sys.stdout.flush()
    DATA_rma = DATA_mas5 = DATA_illu = None
    if datafile_rma is not None:
        print "Reading RMA file: %s" % datafile_rma
        DATA_rma = arrayio.read(datafile_rma)
        DATA_rma = arrayio.convert(DATA_rma, to_format=arrayio.gct_format)
    if datafile_mas5 is not None:
        print "Reading MAS5 file: %s" % datafile_mas5
        DATA_mas5 = arrayio.read(datafile_mas5)
        DATA_mas5 = arrayio.convert(DATA_mas5, to_format=arrayio.gct_format)
    if datafile_illu is not None:
        print "Reading ILLU file: %s" % datafile_illu
        DATA_illu = arrayio.read(datafile_illu)
        DATA_illu = arrayio.convert(DATA_illu, to_format=arrayio.gct_format)
    # Don't handle the log here.  Let pybinreg do it.

    # Make sure the data sets contain the same samples.  Align them if
    # necessary.
    DATA_all = [
        ("DATA_rma", DATA_rma), ("DATA_mas5", DATA_mas5),
        ("DATA_illu", DATA_illu)]
    DATA_all = [x for x in DATA_all if x[1]]
    for i in range(1, len(DATA_all)):
        key1, data1 = DATA_all[0]
        key2, data2 = DATA_all[i]
        assert key1 != key2
        assert data1 and data2
        assert data1.ncol() == data2.ncol(), \
            "%s and %s data sets have different numbers of samples." % (
                key1, key2)
        if matrixlib.are_cols_aligned(data1, data2):
            continue
        x = matrixlib.align_cols(data1, data2)
        data1_new, data2_new = x
        assert matrixlib.are_cols_aligned(data1_new, data2_new)
        # The samples in data1 (the reference) should not be changed.
        assert data1.ncol() == data1_new.ncol(), \
            "%s and %s data sets have different samples" % (key1, key2)
        assert matrixlib.are_cols_aligned(data1, data1_new)
        DATA_all[i] = key2, data2_new

    for key, data in DATA_all:
        if key == "DATA_rma":
            DATA_rma = data
        elif key == "DATA_mas5":
            DATA_mas5 = data
        elif key == "DATA_illu":
            DATA_illu = data
        else:
            raise AssertionError("Unknown key: %s" % key)

    print "Writing aligned signal files."
    if DATA_rma:
        arrayio.gct_format.write(
            DATA_rma, open(file_layout.DATASET_RMA, 'w'))
    if DATA_mas5:
        arrayio.gct_format.write(
            DATA_mas5, open(file_layout.DATASET_MAS5, 'w'))
    if DATA_illu:
        arrayio.gct_format.write(
            DATA_illu, open(file_layout.DATASET_ILLU, 'w'))

    # Figure out the names and paths for each signature.
    print "Finding signatures."
    names = [None] * len(signatures)   # SIG19_AKT[_modified]
    paths = [None] * len(signatures)   # <path>/SIG19_AKT[_modified]
    for i, sig in enumerate(signatures):
        name = "SIG%02d_%s" % (sig.xID, hashlib.hash_var(sig.Name))
        # If the user has modified the signature from the default
        # parameters, then make a note of it.
        if getattr(sig, "Changed", False):
            name = "%s_modified" % name
        outpath = os.path.join(file_layout.OUTPATH, name)
        names[i] = name
        paths[i] = outpath

    if options.max_signatures is not None:
        signatures = signatures[:options.max_signatures]

    # Make a list of the jobs.
    jobs = []  # list of (cmd, outpath, outfile)
    for i, sig in enumerate(signatures):
        name, outpath = names[i], paths[i]

        quantile_normalize = False
        assert sig.Quantile.upper() in ["YES", "NO"]
        if sig.Quantile.upper() == "YES":
            quantile_normalize = True
        shift_scale_normalize = False
        assert sig.Shift_Scale.upper() in ["YES", "NO"]
        if sig.Shift_Scale.upper() == "YES":
            shift_scale_normalize = True

        outfile = os.path.join(outpath, "out.txt")

        if sig.Normalization.upper() == "RMA":
            datafile = file_layout.DATASET_RMA
            assert DATA_rma
        elif sig.Normalization.upper() == "MAS5":
            datafile = file_layout.DATASET_MAS5
            assert DATA_mas5
        elif sig.Normalization.upper() == "ILLU":
            datafile = file_layout.DATASET_ILLU
            assert DATA_illu
        else:
            raise AssertionError("Unknown normalization.")

        # If the entire analysis should be archived, then go ahead and
        # archive each of the pybinreg runs too.  This will prevent
        # large analyses from taking up too much disk space.  The
        # drawback is that the files that are archived are no longer
        # available for use here.  Hopefully this won't be a problem.
        cmd = make_pybinreg_cmd(
            options.pybinreg, options.python, options.binreg_path,
            options.matlab, options.arrayplot, options.povray,
            options.cluster, options.libpath, outpath, options.archive,
            sig.Genes, sig.Metagenes, quantile_normalize,
            shift_scale_normalize, sig.Train0, sig.Train1, datafile)
        jobs.append((cmd, outpath, outfile))

    # Run each of the jobs.
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if options.num_procs > 1:
        if parallel._find_parallel():
            num_sigs = min(options.num_procs, len(jobs))
            if num_sigs > 1:
                print "Predicting %d signatures at a time." % num_sigs
        else:
            print ("I could not find GNU parallel.  "
                   "Predicting 1 signature at a time.")
            options.num_procs = 1
    sys.stdout.flush()

    DEBUG = False  # Can disable pybinreg temporarily for debugging.
    if not DEBUG:
        if options.num_procs <= 1:
            for cmd, outpath, outfile in jobs:
                run_one_pybinreg(cmd, outpath, outfile)
        else:
            run_many_pybinreg(jobs, options.num_procs)

    if signatures:
        print "Extracting the reports from each signature."
        report_files = extract_reports(names, paths, file_layout)

        print "Combining probabilities from each of the signatures."
        summarize_probabilities(signatures, names, paths, file_layout)

        print "Making heatmap of the results."
        sys.stdout.flush()
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            options.libpath, file_layout)

        print "Summarizing signatures."
        summarize_signatures(signatures, file_layout)

        print "Making a report."
        analysis_name = make_analysis_name(options)
        summarize_report(
            analysis_name, signatures, orig_signatures, report_files,
            start_time, why_dropped, file_layout)

    if options.archive:
        print "Compressing results."
        sys.stdout.flush()
        archive.zip_path(file_layout.ATTIC)
        for i, sig in enumerate(signatures):
            name, outpath = names[i], paths[i]
            archive.zip_path(outpath)

    print "Done."
def main():
    from optparse import OptionParser, OptionGroup

    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm_bin", dest="bfrm_bin", default=None,
        help="Specify the path to the BFRM binary.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Filtering")
    group.add_option(
        "--filter_mean", dest="filter_mean", type=float, default=None,
        help="Remove this portion of genes based on mean expression.")
    group.add_option(
        "--filter_var", dest="filter_var", type=float, default=None,
        help="Remove this portion of genes based on variance.")
    group.add_option(
        "--cutoff", dest="cutoff", type=float, default=0.99,
        help="Cutoff probability for a gene to be in a factor.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "BFRM Parameters")
    group.add_option(
        "--nc", dest="num_control_vars", type="int", default=None,
        help="Specify the number of control variables to use.")
    group.add_option(
        "--num_factors", dest="num_factors", type="int", default=None,
        help="The number of factors to fit.  "
        "For evolutionary search, starts with this number of factors.")
    group.add_option(
        "--design_file", dest="design_file", default=None,
        help="A file containing a matrix with additional design variables.")
    group.add_option(
        "--nucleus_file", dest="nucleus_file", default=None,
        help="A file that contains the genes to start the evolution.  "
        "This should be a text file that contains a whitespace-separated "
        "list of genes.  If this or --nucleus_geneset is given, "
        "the evolutionary search will be turned on.")
    group.add_option(
        "--nucleus_geneset", dest="nucleus_geneset", default=None,
        help="A gene set that contains the genes to start the evolution.  "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    group.add_option(
        "--evol_max_factors", dest="evol_max_factors", default=None,
        help="Maximum number of factors for the evolution.")
    group.add_option(
        "--evol_max_genes", dest="evol_max_genes", default=None,
        help="Maximum number of genes for the evolution.")
    parser.add_option_group(group)

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.cutoff <= 0 or options.cutoff > 1:
        parser.error("Cutoff probability should be between 0 and 1.")
    if options.filter_mean and \
           (options.filter_mean < 0 or options.filter_mean >= 1):
        parser.error("filter_mean should be between 0 and 1.")
    if options.filter_var and \
           (options.filter_var < 0 or options.filter_var >= 1):
        parser.error("filter_var should be between 0 and 1.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import these after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 1:
        parser.error("Please specify a file to factor.")
    filename, = args
    assert os.path.exists(filename), "File not found: %s" % filename

    if options.nucleus_file and options.nucleus_geneset:
        parser.error("Please specify either nucleus_file or "
                     "nucleus_geneset.")
    nucleus = None
    if options.nucleus_file:
        nucleus = _read_nucleus_file(options.nucleus_file)
    elif options.nucleus_geneset:
        nucleus = _read_nucleus_geneset(options.nucleus_geneset)
    # Not sure if this is necessary.  Don't know if BFRM will provide
    # a default if not given.
    if nucleus:
        assert options.num_factors, "Please specify the number of factors."

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    x = arrayio.read(filename)
    MATRIX_orig = arrayio.convert(x, to_format=arrayio.gct_format)
    print "Read data set with %d genes and %d samples." % (
        MATRIX_orig.nrow(), MATRIX_orig.ncol())

    # Make a copy so that in-place changes (like log_matrix) won't
    # affect the original matrix.
    MATRIX = MATRIX_orig.matrix()

    # Log the data set if necessary.
    log_matrix(MATRIX)

    # Filter the genes based on mean and variance.
    MATRIX = filter_dataset(MATRIX, options.filter_mean, options.filter_var)
    if MATRIX.nrow() != MATRIX_orig.nrow():
        print "Filtered from %d genes to %d." % (
            MATRIX_orig.nrow(), MATRIX.nrow())

    # Write out the data sets.
    write_dataset(file_layout.DATASET_ORIG, MATRIX_orig)
    write_dataset(file_layout.DATASET, MATRIX)

    # Run BFRM.
    DEBUG = False
    if not DEBUG:
        run_bfrm(
            file_layout, options.bfrm_bin, options.num_control_vars,
            options.num_factors, options.design_file, nucleus,
            options.evol_max_factors, options.evol_max_genes)

    # Generate the output files.
    summarize_factor_scores(
        file_layout, options.cutoff, options.python, options.arrayplot,
        options.cluster, options.libpath)
    summarize_gene_factor_probs(
        file_layout, options.cutoff, options.python, options.arrayplot,
        options.cluster, options.libpath)
    summarize_factor_geneset(file_layout, options.cutoff)

    # The BFRM model file should always be archived.
    archive.zip_path(file_layout.BFRM, noclobber=False)
    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print "Done."
def main():
    from optparse import OptionParser, OptionGroup

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "", "--selap", dest="selap_path", default=None,
        help="Specify the path to SELAPv3.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher penalties yield more subgroups.  Penalties range from 0 down.
    group.add_option(
        "-p", "--penalty", dest="penalty", default="-33",
        help="Penalty for tuning the number of subgroups (default -33).")
    group.add_option(
        "-m", "--model", dest="model_file", default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)
    if options.penalty.find(".") >= 0:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import these after the library path is set.
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))
    assert len(penalties) <= MAX_MODELS, \
        "Too many penalties (max is %d)." % MAX_MODELS
    assert penalties, "At least one penalty must be specified."
    assert not (options.model_file and len(penalties) != 1)
    for p in penalties:
        assert p <= 0, "Penalties should be negative."
    num_analyses = len(penalties)

    # Set up the files.
    file_layout = make_file_layout(
        options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)
    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files for this penalty.
        file_layout = make_file_layout(
            options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(
                options.selap_path, penalty, file_layout, options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            file_layout, options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print "Archiving results."
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue
        # Now do some cleanup if multiple analyses were requested.
        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
        ]
        for src, dst in files_to_copy:
            assert os.path.exists(src)
            os.system("cp -p '%s' '%s'" % (src, dst))
        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print "Done."
def test_format_conversion():
    import StringIO
    import arrayio

    file_jeff = "samples/0159_cl.small.rma"
    file_pcl = "samples/0159_cl.small.pcl"
    file_gct = "samples/0159_cl.small.gct"

    # Test choose_format.
    fmt = arrayio.choose_format(file_jeff)
    test(print_fn, (fmt.__name__,), {}, "arrayio.jeffs_format")
    fmt = arrayio.choose_format(file_pcl)
    test(print_fn, (fmt.__name__,), {}, "arrayio.pcl_format")
    fmt = arrayio.choose_format(file_gct)
    test(print_fn, (fmt.__name__,), {}, "arrayio.gct_format")

    # Test guess_format.
    X_jeff = arrayio.read(file_jeff, datatype=None)
    X_pcl = arrayio.read(file_pcl, datatype=None)
    X_gct = arrayio.read(file_gct, datatype=None)
    fmt = arrayio.guess_format(X_jeff)
    test(print_fn, (fmt.__name__,), {}, "arrayio.jeffs_format")
    fmt = arrayio.guess_format(X_pcl)
    test(print_fn, (fmt.__name__,), {}, "arrayio.pcl_format")
    fmt = arrayio.guess_format(X_gct)
    test(print_fn, (fmt.__name__,), {}, "arrayio.gct_format")

    # Re-read the matrix.  Format conversion or float conversion might
    # mess things up.
    X_jeff = arrayio.read(file_jeff, datatype=None)

    # Test convert.
    # _jeff_to_pcl
    handle = StringIO.StringIO()
    X_pcl = arrayio.convert(X_jeff, to_format=arrayio.pcl_format)
    arrayio.pcl_format.write(X_pcl, handle)
    #test(handle.getvalue, (), {}, open(file_pcl).read())

    # _jeff_to_gct
    handle = StringIO.StringIO()
    X_gct = arrayio.convert(X_jeff, to_format=arrayio.gct_format)
    arrayio.gct_format.write(X_gct, handle)
    #test(handle.getvalue, (), {}, open(file_gct).read())

    # _gct_to_pcl
    handle = StringIO.StringIO()
    X_pcl = arrayio.convert(X_gct, to_format=arrayio.pcl_format)
    arrayio.pcl_format.write(X_pcl, handle)
    # _gct_to_pcl changes the gene id to "GeneID".  Fix this so that
    # we can compare against the gold standard file.  Everything else
    # should be the same.
    x = handle.getvalue()
    x = x.replace("GeneID", "Probe.Set.ID")
    test(print_fn, (x,), {}, open(file_pcl).read())

    # _pcl_to_gct
    handle = StringIO.StringIO()
    X_pcl = arrayio.read(file_pcl, datatype=None)
    X_gct = arrayio.convert(X_pcl, to_format=arrayio.gct_format)
    arrayio.gct_format.write(X_gct, handle)
    test(handle.getvalue, (), {}, open(file_gct).read())

    # _tdf_to_gct
    handle = StringIO.StringIO()
    X_pcl = arrayio.read(file_pcl, datatype=None)
    X_gct = arrayio.convert(
        X_pcl, from_format=arrayio.tab_delimited_format,
        to_format=arrayio.gct_format)
    arrayio.gct_format.write(X_gct, handle)
    test(handle.getvalue, (), {}, open(file_gct).read())