def merge_two_files(A_file, B_file, handle):
    """Merge two matrix files on their shared rows and write the result.

    Both files are read with arrayio and must be tab-delimited matrices.
    Rows are aligned with matrixlib.align_rows, and the data values of
    each row of B_file are appended to the matching row of A_file.
    Only column annotations present in both files are kept; a value
    from B_file that already occurs in A_file's annotation is given a
    '_2' suffix.  The merged matrix is written to `handle` in
    tab-delimited format.

    Raises AssertionError if either file is not a tab-delimited matrix
    or if no rows are left after alignment.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import matrixlib

    left = arrayio.read(A_file)
    right = arrayio.read(B_file)
    assert arrayio.tab_delimited_format.is_matrix(left)
    assert arrayio.tab_delimited_format.is_matrix(right)

    left, right = matrixlib.align_rows(left, right)
    assert left.nrow() > 0, 'there is no common genes between two files'

    # Concatenate the data values row by row.
    merged_X = [left._X[i] + right._X[i] for i in range(left.dim()[0])]

    merged_col_names = {}
    for header in left._col_names:
        if header not in right._col_names:
            continue
        left_samples = left._col_names[header]
        renamed = []
        for sample in right._col_names[header]:
            # Disambiguate names that already exist in the first file.
            if sample in left_samples:
                renamed.append(sample + '_2')
            else:
                renamed.append(sample)
        merged_col_names[header] = left_samples + renamed

    merged = Matrix.InMemoryMatrix(
        merged_X, left._row_names, merged_col_names, left._row_order)
    arrayio.tab_delimited_format.write(merged, handle)
def summarize_predictions(file_layout):
    """Write the predictions as a subgroup x sample table.

    Reads the sample names from file_layout.DATASET, the predictions
    from file_layout.SELAP_PREDICT and the cluster names from the
    "clust.txt" member of file_layout.SMODEL_ZIP, then writes one
    tab-delimited row per subgroup to file_layout.PREDICTIONS_PCL.

    Raises AssertionError if the number of prediction rows differs from
    the number of samples, if the model archive has no "clust.txt", or
    if the number of cluster names differs from the number of
    prediction columns.
    """
    import zipfile
    import arrayio
    from genomicode import archive

    clean_many = arrayio.tab_delimited_format._clean_many

    # Load the original dataset.  Should be pathway x sample.
    M_data = arrayio.read(file_layout.DATASET)
    sample_names = M_data.col_names(arrayio.COL_ID)

    # Read the predictions.  Will be a sample x probability matrix.
    M_predict = arrayio.read(file_layout.SELAP_PREDICT)
    assert M_predict.nrow() == len(sample_names)
    num_subgroups = M_predict.ncol()

    # Read the cluster names from the model.  The archive is closed
    # explicitly so the handle is released even if reading fails.
    s2f = archive.unzip_dict(file_layout.SMODEL_ZIP)
    assert "clust.txt" in s2f
    zfile = zipfile.ZipFile(file_layout.SMODEL_ZIP)
    try:
        clust_names = zfile.open(s2f["clust.txt"]).readlines()
    finally:
        zfile.close()
    assert len(clust_names) == num_subgroups, \
        "I have %d subgroups but %d names." % (
            num_subgroups, len(clust_names))

    # Save a subgroup x sample matrix.
    # NOTE(review): the names still carry the line endings left by
    # readlines(); presumably _clean_many removes them -- confirm.
    with open(file_layout.PREDICTIONS_PCL, 'w') as handle:
        header = clean_many(["Subgroup"] + sample_names)
        handle.write("\t".join(header) + "\n")
        for i in range(num_subgroups):
            probs = M_predict.value(None, i)
            row = [clust_names[i]] + probs
            row = clean_many([str(v) for v in row])
            handle.write("\t".join(row) + "\n")
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Write a two-class CLS file for a pair of signal files.

    `antecedents` is a pair of nodes.  The first must have its
    "contents" attribute set to "class0" and the second to "class1".
    Every sample of the first file is labelled 0 and every sample of
    the second is labelled 1, and the labels are written to `outfile`
    with arraysetlib.write_cls_file.

    Raises AssertionError if a sample name occurs more than once across
    the two files or if the "contents" attributes are not as expected.
    """
    import arrayio
    from genomicode import arraysetlib

    node0, node1 = antecedents
    matrix0 = arrayio.read(node0.identifier)
    matrix1 = arrayio.read(node1.identifier)
    names0 = matrix0.col_names(arrayio.COL_ID)
    names1 = matrix1.col_names(arrayio.COL_ID)

    # Make sure no duplicate sample names.
    seen = set()
    repeated = set()
    for name in names0 + names1:
        if name in seen:
            repeated.add(name)
        else:
            seen.add(name)
    repeated = sorted(repeated)
    assert not repeated, "Duplicate sample names: %s" % ", ".join(repeated)

    assert node0.data.attributes["contents"] == "class0"
    assert node1.data.attributes["contents"] == "class1"

    classes = [0] * len(names0) + [1] * len(names1)
    arraysetlib.write_cls_file(outfile, "class0", "class1", classes)
def make_model(selap_path, penalty, file_layout, matlab):
    """Run SELAP and package its output as a model archive.

    Calls selap.selap_make_raw on file_layout.SELAP_DATASET, checks that
    the MU, SIG and PROB files were produced, and moves them into a
    model directory as mu.txt, sig.txt and prob.txt.  Adds var.txt (the
    row IDs of file_layout.DATASET) and clust.txt (default subgroup
    names), zips the directory with archive.zip_path and validates the
    result with check_model.

    The model directory is file_layout.SMODEL_ZIP without its ".zip"
    suffix.

    Raises AssertionError if SELAP did not produce its output files, if
    the dataset does not have as many rows as the MU matrix, or if the
    zip file is missing afterwards.
    """
    import arrayio
    from genomicode import parselib
    from genomicode import archive
    from genomicode import selap

    print("Generating subgroups with penalty %d." % penalty)
    x = selap.selap_make_raw(
        file_layout.SELAP_DATASET, penalty, matlab_bin=matlab,
        selap_path=selap_path, outpath=file_layout.SELAP)
    print(x)

    # Make sure SELAP ran correctly.
    msg = "Missing file. SELAPver3 did not run correctly."
    assert os.path.exists(file_layout.SELAP_MU), msg
    assert os.path.exists(file_layout.SELAP_SIG), msg
    assert os.path.exists(file_layout.SELAP_PROB), msg

    # Figure out the number of variables and the number of subgroups.
    X = arrayio.read(file_layout.SELAP_MU)
    num_vars, num_subgroups = X.dim()

    # Make the model file.  Only the trailing ".zip" is removed, so a
    # ".zip" elsewhere in the path (e.g. in a directory name) is left
    # alone.  Names without the suffix keep the previous behaviour.
    opj = os.path.join
    zip_name = file_layout.SMODEL_ZIP
    if zip_name.endswith(".zip"):
        path = zip_name[:-len(".zip")]
    else:
        path = zip_name.replace(".zip", "")
    if not os.path.exists(path):
        os.mkdir(path)

    # Move over the files generated by SELAP.
    os.rename(file_layout.SELAP_MU, opj(path, "mu.txt"))
    os.rename(file_layout.SELAP_SIG, opj(path, "sig.txt"))
    os.rename(file_layout.SELAP_PROB, opj(path, "prob.txt"))

    # Generate the var.txt file.
    M = arrayio.read(file_layout.DATASET)
    assert M.nrow() == num_vars
    names = M.row_names(arrayio.ROW_ID)
    assert len(names) == num_vars
    with open(opj(path, "var.txt"), 'w') as handle:
        for name in names:
            handle.write("%s\n" % (name,))

    # Generate the clust.txt file.
    # Set the names of the subgroups to a reasonable default.
    group_names = [
        "GROUP%s" % x for x in parselib.pretty_range(0, num_subgroups)]
    with open(opj(path, "clust.txt"), 'w') as handle:
        for name in group_names:
            handle.write("%s\n" % (name,))

    archive.zip_path(path, noclobber=False)
    assert os.path.exists(file_layout.SMODEL_ZIP)

    check_model(file_layout.SMODEL_ZIP)
def correlation_for_file(data_file, label_file, gene_num=True):
    """Return gene names ordered by decreasing correlation with the labels.

    data_file is read with arrayio and scored with correlation() against
    the label line of label_file.  Gene names are taken from the first
    tab-separated column of data_file, skipping its first two lines
    (the layout of a pcl file).

    gene_num is the number of top genes to return.  The default, True,
    is a sentinel meaning "return every gene".
    """
    # obtain the class label.  Only the label line (second item) is
    # needed; indexing instead of unpacking tolerates a read() that
    # returns either two or three values.
    label_info = read_label_file.read(label_file)
    label_line = label_info[1]

    # read the data_file and calculate the correlation
    M = arrayio.read(data_file)
    p = correlation(M, label_line)

    # sort the correlation value and obtain the
    # list of the gene index after sorting
    c = sorted(p, reverse=True)
    sortlist = find_sorted_index(p, c)

    # obtain the gene name in the data_file
    with open(data_file) as f:
        lines = f.read().split('\n')
    index = 0  # for pcl file
    startrows = 2  # for pcl file
    genelist = [line.split('\t')[index] for line in lines[startrows:]]

    # get a list of selected gene name
    if gene_num is not True:
        return [genelist[i] for i in sortlist[0:gene_num]]
    return [genelist[i] for i in sortlist]
def summarize_subgroups(outpath, num_analyses, penalties):
    """Write a table of the number of subgroups found for each penalty.

    For every penalty the row count of that penalty's
    GLOBAL_PREDICTIONS_PCL file is taken as the number of subgroups.
    The table is written, largest penalty first, to the SUMMARY file of
    the layout built for the largest penalty.  Does nothing when
    `penalties` is empty.
    """
    # Count the number of subgroups for each penalty.
    import arrayio

    if not penalties:
        return

    penalty2subgroups = {}
    for penalty in penalties:
        fl = make_file_layout(outpath, num_analyses, penalty)
        M = arrayio.read(fl.GLOBAL_PREDICTIONS_PCL)
        penalty2subgroups[penalty] = M.nrow()

    # Write output, with penalties sorted from big to small.
    ordered = sorted(penalty2subgroups, reverse=True)
    fl = make_file_layout(outpath, num_analyses, ordered[0])
    # The with-block closes the summary file even if a write fails.
    with open(fl.SUMMARY, 'w') as handle:
        handle.write("\t".join(["Penalty", "Num Subgroups"]) + "\n")
        for penalty in ordered:
            row = penalty, penalty2subgroups[penalty]
            handle.write("\t".join(map(str, row)) + "\n")
def score_many(jobs, lock=None):
    """Score a list of jobs and return the combined results.

    Each job is a tuple of (gs_name, pos_genes, neg_genes, matrix_name,
    matrix_file, any_matching_gene_sets).  A job with positive or
    negative genes is scored with score_gene_set; a job where both are
    None is scored with score_gene.  Each matrix file is read once and
    reused by later jobs.

    Returns a dict of (matrix_name, gs_name, index, sample) ->
    GeneSetScore or GeneScore.

    Raises AssertionError if a matrix has missing values, or if a job
    without genes has anything other than None for pos_genes/neg_genes.
    """
    import arrayio

    matrix_cache = {}
    results = {}
    for job in jobs:
        (gs_name, pos_genes, neg_genes, matrix_name, matrix_file,
         any_matching_gene_sets) = job

        try:
            MATRIX = matrix_cache[matrix_file]
        except KeyError:
            MATRIX = matrix_cache[matrix_file] = arrayio.read(matrix_file)
        assert not has_missing_values(MATRIX), \
            "Matrix %s has missing values." % matrix_name

        if not (pos_genes or neg_genes):
            assert not (pos_genes == [] and neg_genes == []), \
                "Empty gene set: %s" % gs_name
            assert pos_genes is None, "Has pos genes: %s" % gs_name
            assert neg_genes is None, "Has neg genes: %s" % gs_name
            scores = score_gene(gs_name, matrix_name, MATRIX)
        else:
            scores = score_gene_set(
                gs_name, pos_genes, neg_genes, matrix_name, MATRIX,
                any_matching_gene_sets, lock=lock)
        # TODO: should make sure we don't overwrite previous results.
        results.update(scores)
    return results
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Log2-transform the input signal file and write it to outfile.

    Values below 1 are raised to 1 before taking the logarithm, so the
    smallest transformed value is 0.  Missing values (None) are left
    untouched.  The result is written in tab-delimited format.

    Raises AssertionError if binreg reports that the input is already
    logged.
    """
    import math
    import arrayio
    from genomicode import filelib
    from genomicode import binreg

    signal_file = in_data.identifier
    filelib.assert_exists_nz(signal_file)
    M = arrayio.read(signal_file)
    assert not binreg.is_logged_array_data(M), 'the file is logged'

    # Change the matrix in place.
    X = M._X
    for i in range(len(X)):
        row = X[i]
        for j in range(len(row)):
            x = row[j]
            if x is None:
                continue
            x = float(x)
            if x < 1:
                x = 1
            row[j] = math.log(x, 2)

    M_c = arrayio.convert(M, to_format=arrayio.tab_delimited_format)
    # The with-block closes (and therefore flushes) the output file;
    # previously the handle was left open.
    with open(outfile, 'w') as handle:
        arrayio.tab_delimited_format.write(M_c, handle)
def read_matrices(filenames, cache=None):
    """Read a list of matrices and align them.

    filenames is a list of the matrix files to read.  Returns a tuple
    where the first element is a list of the matrices read, and the
    second is the aligned matrix.

    cache is an optional dictionary of filename to matrix.  This can
    be used to prevent re-reading of matrices.

    """
    import copy
    import arrayio
    import filelib

    # Check every file up front so a missing file is reported by name
    # before any matrix is read.
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Load the files.
    DATA = []
    for filename in filenames:
        if cache is not None and filename in cache:
            # Work on a deep copy of the cached matrix rather than the
            # cached object itself.
            x = copy.deepcopy(cache[filename])
        else:
            try:
                x = arrayio.read(filename)
            except (SystemError, KeyboardInterrupt, MemoryError), x:
                # Let these propagate unchanged.
                raise
            except Exception, x:
                # Can diagnose which file failed here.
                # raise
                # Any other failure is re-raised as a plain Exception
                # that names the offending file.
                raise Exception, "Problem reading %s: %s" % (repr(filename), str(x))
            # NOTE(review): the nesting of this statement cannot be
            # recovered from the flattened source; it is placed here on
            # the assumption that only freshly read matrices are
            # stored in the cache -- confirm.
            if cache is not None:
                cache[filename] = x
    # NOTE(review): the visible code stops here.  DATA is never filled
    # and nothing is returned, although the docstring promises a tuple;
    # the rest of the function appears to be missing from this view.
def read_gene_expression(filename):
    """Read a matrix file with arrayio and return the matrix.

    Raises AssertionError if `filename` does not exist.
    """
    import os
    import arrayio

    file_found = os.path.exists(filename)
    assert file_found
    return arrayio.read(filename)
def format_firehose_rsem(filename, output):
    """Rewrite a Firehose RSEM matrix with separate gene ID/symbol columns.

    The input must have a single row header "Hybridization REF" whose
    values look like "<symbol>|<id>", and column headers "_SAMPLE_NAME"
    and "gene_id".  A symbol of "?" is written as an empty string.  The
    output is a tab-delimited file with columns "Gene ID", "Gene Symbol"
    and one column per sample.

    Raises AssertionError if the headers differ from that layout, if a
    row name does not split into exactly two parts on "|", or if a data
    row does not match the header width.
    """
    import arrayio

    HYB_REF = "Hybridization REF"
    GENE_ID = "gene_id"

    DATA = arrayio.read(filename)
    assert DATA._row_order == [HYB_REF]
    assert DATA._col_order == ["_SAMPLE_NAME", GENE_ID]

    gene_ids = []
    gene_symbols = []
    for gene in DATA.row_names(HYB_REF):
        parts = gene.split("|")
        assert len(parts) == 2
        gene_symbol, gene_id = parts
        if gene_symbol == "?":
            gene_symbol = ""
        gene_ids.append(gene_id)
        gene_symbols.append(gene_symbol)

    header = ["Gene ID", "Gene Symbol"] + DATA.col_names("_SAMPLE_NAME")
    # The with-block closes the output even when the width assertion
    # below fires part-way through writing.
    with open(output, 'w') as f:
        f.write("\t".join(header) + '\n')
        for i in range(DATA.nrow()):
            row = [gene_ids[i], gene_symbols[i]] + DATA._X[i]
            assert len(row) == len(header)
            f.write("\t".join(map(str, row)) + '\n')
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Drop rows that have too many missing values.

    A value counts as missing when it is None or 'NA'.  A row is kept
    when its fraction of missing values is strictly below
    user_options['filter_value'] / 100.  The surviving rows are written
    to `outfile` in tab-delimited format.

    Raises AssertionError if the output file is missing or empty
    afterwards.
    """
    from genomicode import filelib
    import arrayio

    in_data = antecedents
    M = arrayio.read(in_data.identifier)

    # get the percentage of gene filter
    percent = float(user_options['filter_value']) / 100

    nrow, ncol = M.dim()[0], M.dim()[1]
    I_good = []
    for i in range(nrow):
        row = M._X[i]
        missing_count = sum(
            1 for j in range(ncol) if row[j] in [None, 'NA'])
        if float(missing_count) / ncol < percent:
            I_good.append(i)

    M_c = M.matrix(I_good, None)
    # Open the output only once there is something to write, so a
    # failure while reading or filtering leaves no empty file behind
    # and no handle open.
    with open(outfile, 'w') as f_out:
        arrayio.tab_delimited_format.write(M_c, f_out)
    assert filelib.exists_nz(outfile), (
        'the output file %s for gene_filter fails' % outfile
    )
def plot_line_keywd(filename, keyword, outfile):
    """Plot one line per row whose second annotation equals `keyword`.

    The first row header supplies the name shown in the legend, as
    "<keyword> (<name>)"; the second row header is compared with
    `keyword`.  The x-axis labels come from the '_SAMPLE_NAME' column
    annotation.  The figure is saved to `outfile`.

    Raises AssertionError if no row matches the keyword or if the
    output file is missing or empty afterwards.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    # Fetch the annotations and the data once instead of once per row.
    names = M.row_names(header[0])
    row_keywords = M.row_names(header[1])
    values = M.slice()

    data = []
    legend_name = []
    for i in range(M.dim()[0]):
        if row_keywords[i] == keyword:
            data.append(values[i])
            legend_name.append("%s (%s)" % (keyword, names[i]))
    assert len(data) > 0, 'cannot find the keyword %s in the file %s' % (
        keyword, filename)

    # Each line is a list of (x position, value) points.
    lines = [[(j, row[j]) for j in range(len(row))] for row in data]
    params = {
        "box_label": label,
        "legend": legend_name,
        "ylim_min": 0,
        "ylabel": "Signal",
        "left": 0.1,
    }
    fig = mplgraph.lineplot(*lines, **params)
    fig.savefig(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywd fails'
def plot_hyb_bar(filename, outfile):
    """Plot mean signal of the high/med/low 'cy3_hyb' control probes.

    Rows whose second annotation is 'cy3_hyb' are grouped by their
    first annotation (the probe ID) into the hard-coded high, medium
    and low probe sets.  The mean and standard deviation of each group
    are drawn as a bar plot and saved to `outfile`.

    Raises AssertionError ('input is not a control file') if any group
    mean is NaN.
    """
    from genomicode import mplgraph
    from genomicode import filelib
    import math
    import numpy
    import arrayio

    high = ['ILMN_2038770', 'ILMN_2038769']
    med = ['ILMN_2038768', 'ILMN_2038771']
    low = ['ILMN_1343050', 'ILMN_1343052']
    high_data = []
    med_data = []
    low_data = []

    M = arrayio.read(filename)
    header = M.row_names()

    # Fetch the annotations and the data once instead of once per row.
    probe_ids = M.row_names(header[0])
    control_types = M.row_names(header[1])
    values = M.slice()

    for i in range(M.dim()[0]):
        if not control_types[i] == 'cy3_hyb':
            continue
        if probe_ids[i] in high:
            high_data.extend(values[i])
        if probe_ids[i] in med:
            med_data.extend(values[i])
        if probe_ids[i] in low:
            low_data.extend(values[i])

    mean = [numpy.mean(high_data), numpy.mean(med_data), numpy.mean(low_data)]
    assert not any(math.isnan(m) for m in mean), 'input is not a control file'
    std = [numpy.std(high_data), numpy.std(med_data), numpy.std(low_data)]
    fig = mplgraph.barplot(mean, std,
                           ylabel='Signal',
                           box_label=['high', 'med', 'low'])
    fig.savefig(outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Keep the rows whose two class means differ by at least log2(fc).

    `antecedents` is a (data node, class-label node) pair and the label
    file must describe exactly two classes.  For every row the absolute
    difference between the mean of the first class's columns and the
    mean of the second class's columns is compared with log2 of
    user_options['group_fc_num'] (default 1).  Rows at or above the
    threshold are written to `outfile` in tab-delimited format.

    Raises AssertionError if there are not exactly two classes or if no
    row passes the threshold.
    """
    import math
    from Betsy import read_label_file
    from genomicode import jmath
    import arrayio

    data_node, cls_node = antecedents
    # obtain the class label
    label, label_line, second_line = read_label_file.read(
        cls_node.identifier)
    class_num = len(label)
    assert class_num == 2, 'the number of class is not 2'

    fc = 1
    if 'group_fc_num' in user_options:
        fc = int(user_options['group_fc_num'])

    M = arrayio.read(data_node.identifier)
    first = M.slice(None, label[0][0])
    second = M.slice(None, label[1][0])

    # The threshold does not depend on the row, so compute it once.
    threshold = math.log(fc, 2)
    I_good = []
    for i in range(M.nrow()):
        fold_change = abs(jmath.mean(first[i]) - jmath.mean(second[i]))
        if fold_change >= threshold:
            I_good.append(i)
    assert I_good, 'there is no gene is significant in fold change with 2'

    M_c = M.matrix(I_good, None)
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(M_c, f)
def t_test_for_file(data_file, label_file, gene_num=True):
    """Return gene names ordered by increasing t-test p-value.

    label_file must describe exactly two classes.  The columns of each
    class are sliced out of data_file and compared with t_test().  Gene
    names are taken from the first tab-separated column of data_file,
    skipping its first two lines (the layout of a pcl file).

    gene_num is the number of top genes to return.  The default, True,
    is a sentinel meaning "return every gene".

    Raises AssertionError if there are not exactly two classes.
    """
    # obtain the class label
    label, label_line, second_line = read_label_file.read(label_file)
    class_num = len(label)
    assert class_num == 2, 'the number of class is not 2'

    # read the data_file and calculate the t-test
    M = arrayio.read(data_file)
    first = M.slice(None, label[0][0])
    second = M.slice(None, label[1][0])
    t, p = t_test(first, second)

    # sort the p value and obtain the list of the gene index after sorting
    c = sorted(p)
    sortlist = find_sorted_index(p, c)

    # obtain the gene name in the data_file; the with-block closes the
    # file, which was previously left open.
    with open(data_file) as f:
        lines = f.read().split('\n')
    index = 0  # for pcl file
    startrows = 2  # for pcl file
    genelist = [line.split('\t')[index] for line in lines[startrows:]]

    # get a list of selected gene name
    if gene_num is not True:
        return [genelist[i] for i in sortlist[0:gene_num]]
    return [genelist[i] for i in sortlist]
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Shift-scale normalize the first class against the second.

    `antecedents` is a (data node, class-label node) pair and the label
    file must describe exactly two classes.  The columns of the first
    class are normalized against the columns of the second with
    shiftscalenorm.normalize.  Entries that come back as 'nan' are
    replaced by the value in the first column of the second class for
    that row.  The normalized columns are copied back into the full
    matrix, which is written to `outfile` in tab-delimited format.

    Nothing is done when either antecedent is falsy.

    Raises AssertionError if there are not exactly two classes or if
    the output file is missing or empty afterwards.
    """
    from genomicode import shiftscalenorm
    import arrayio
    from Betsy import read_label_file
    from genomicode import filelib

    data_node, cls_node = antecedents
    if not (data_node and cls_node):
        return False

    result, label_line, second_line = read_label_file.read(
        cls_node.identifier)
    assert len(
        result) == 2, 'for shiftscale,there should be only 2 classes'

    M = arrayio.read(data_node.identifier)
    index1 = result[0][0]
    index2 = result[1][0]
    M_1 = M.matrix(None, index1)
    M_2 = M.matrix(None, index2)
    M_y = shiftscalenorm.normalize(M_1, M_2)

    # Patch entries that could not be normalized.
    for i in range(M_y.dim()[0]):
        normalized_row = M_y._X[i]
        for j in range(M_y.dim()[1]):
            if str(normalized_row[j]) == 'nan':
                normalized_row[j] = M_2._X[i][0]

    # Copy the normalized columns back to their original positions.
    for row in range(M.nrow()):
        for k in range(len(index1)):
            M._X[row][index1[k]] = M_y._X[row][k]

    f = file(outfile, 'w')
    arrayio.tab_delimited_format.write(M, f)
    f.close()
    assert filelib.exists_nz(outfile), (
        'the output file %s for shiftscale fails' % outfile)
    # NOTE(review): the flattened source does not show whether this
    # final return sat inside the branch above; it is kept at function
    # level so both paths return False -- confirm.
    return False
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the external BFRM normalization script and save the result.

    The script configured as config.bfrmnorm is run with python on the
    input file, with user_options['num_factors'] factors (default 1)
    and output directory 'tmp_dir'.  The resulting normalized.gct is
    converted to pcl format and written to `outfile`.

    Raises AssertionError if the script cannot be found, if the number
    of factors is below 1 or above the number of columns, or if the
    expected output is missing.  Raises ValueError with the script's
    stderr text if it wrote anything to stderr.
    """
    import os
    import subprocess
    import arrayio
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    in_data = antecedents
    bfrm_path = config.bfrmnorm
    bfrm_BIN = module_utils.which(bfrm_path)
    assert bfrm_BIN, 'cannot find the %s' % bfrm_path

    num_factor = 1
    #num_factor = 10
    if 'num_factors' in user_options:
        num_factor = int(user_options['num_factors'])
        assert num_factor >= 1, 'the num_factor should be >=1'
        # What is single_object?
        #M = arrayio.read(single_object.identifier)
        M = arrayio.read(in_data.identifier)
        col_num = M.ncol()
        assert num_factor <= col_num, (
            'the num_factor should be less than %d' % col_num)

    tmp = 'tmp_dir'
    command = [
        'python', bfrm_BIN, in_data.identifier, '-f', str(num_factor),
        '-o', tmp
    ]
    process = subprocess.Popen(command,
                               shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # Any output on stderr is treated as a failure.
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    assert filelib.exists_nz(tmp), (
        'the output dir %s for bfrm_normalize fails' % tmp)
    out = os.path.join(tmp, 'normalized.gct')
    assert filelib.exists_nz(out), (
        'the output gct file for bfrm_normalize fails')

    M = arrayio.read(out)
    M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(M_new, f)
def run_bfrm_project(file_layout, bfrm_path, matlab_bin):
    """Project the latent factors of a BFRM model onto the data set.

    Reads the model from file_layout.BFRM_MODEL, writes the data set,
    sample probe IDs and model files that the MATLAB code needs, checks
    that the model and the data set share at least one probe ID
    (compared case-insensitively), and runs a generated MATLAB script
    that calls getFacScores and saves 'af' and 'Y' to
    file_layout.BFRM_AF and file_layout.BFRM_Y.

    Raises AssertionError if the model has no latent factors, if
    BFRM_project cannot be found, or if no probes are shared.
    """
    import arrayio
    from genomicode import bfrm
    from genomicode import matlab

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(
        file_layout.BFRM_MODEL, param_file=param_file)
    num_factors = len(model["FACTOR_O"])
    assert num_factors, "No latent factors in the BFRM model."
    x = "Projecting %d latent factors onto data set." % num_factors
    if num_factors == 1:
        x = x.replace("factors", "factor")
    print(x)

    DATA = arrayio.read(file_layout.DATASET)

    bfrm_path = bfrm.find_bfrm_project(bfrm_path)
    assert bfrm_path is not None, "I could not find BFRM_project."
    bfrm_path = os.path.realpath(bfrm_path)

    # Write out the dataset and probe IDs.
    write_bfrm_dataset(file_layout.BFRM_DATASET, DATA)
    write_sample_probe_ids(file_layout.BFRM_SPROBE_IDS, DATA)

    # Write the BFRM model files.
    write_bfrm_files(file_layout.BFRM, file_layout.BFRM_MODEL)

    # Make sure some of the probes are the same.
    with open(file_layout.BFRM_PROBE_IDS) as handle:
        pid = [line.strip() for line in handle]
    pid = [pid[i] for i in model["VariablesIn"]]
    with open(file_layout.BFRM_SPROBE_IDS) as handle:
        spid = [line.strip() for line in handle]
    pid = [p.lower() for p in pid]
    # A set makes each membership test constant-time.
    spid = set(p.lower() for p in spid)
    intersect = [p for p in pid if p in spid]
    assert intersect, "No common probes between model and data set."
    if len(intersect) < len(pid):
        x = "Warning: model contains %d probe IDs, but only matched " + \
            "%d in data set."
        print(x % (len(pid), len(intersect)))

    # Run the matlab script.
    lines = []
    w = lines.append
    w("addpath '%s';\n" % bfrm_path)
    w("addpath '%s/bfrm';\n" % bfrm_path)
    w("y = load('%s');\n" % file_layout.BFRM_DATASET)
    w("probeidsSmp = readWordlist('%s');\n" % file_layout.BFRM_SPROBE_IDS)
    # NOTE(review): unlike its neighbours this statement has no trailing
    # newline, so the first save() shares its line; left unchanged.
    w("[af Y sampleids] = getFacScores('%s/', y, probeidsSmp);" %
      file_layout.BFRM)
    w("save('%s', 'af', '-ASCII', '-TABS');\n" % file_layout.BFRM_AF)
    w("save('%s', 'Y', '-ASCII', '-TABS');\n" % file_layout.BFRM_Y)
    script = "".join(lines)
    x = matlab.run(
        script, matlab_bin=matlab_bin, working_path=file_layout.OUTPATH)
    print(x)
    sys.stdout.flush()
def summarize_heatmap(python, arrayplot, cluster, file_layout, libpath=None):
    """Draw a heatmap of the predictions and tidy up the cluster files.

    The plot size is derived from the dimensions of
    file_layout.PREDICTIONS_PCL and the heatmap is written to
    file_layout.PREDICTIONS_PNG by graphlib.plot_heatmap.  Any
    PREDICTIONS_CDT or PREDICTIONS_ATR file left behind is moved into
    file_layout.ATTIC.

    libpath is passed through to graphlib.plot_heatmap.  It defaults to
    an empty list; None is used as the default value in the signature
    so that no list object is shared between calls.

    Raises AssertionError if the PNG was not created.
    """
    import arrayio
    from genomicode import graphlib

    if libpath is None:
        libpath = []

    M_predict = arrayio.read(file_layout.PREDICTIONS_PCL)
    nrow, ncol = M_predict.dim()

    # Set the size of the plot.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        nrow, ncol, min_box_width=12, min_box_height=12,
        height_width_ratio=nrow * 1.618 / ncol)

    x = graphlib.plot_heatmap(
        file_layout.PREDICTIONS_PCL, file_layout.PREDICTIONS_PNG,
        xpix, ypix, color="bild", show_colorbar=True, show_grid=True,
        scale=-0.5, gain=1.5, no_autoscale=True,
        gene_label=True, array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)
    print(x)

    # Clean up some of the cluster files.  They are moved to the attic
    # rather than deleted: they might be required if people want to
    # plot the data themselves with other plotting software.
    for src in (file_layout.PREDICTIONS_CDT, file_layout.PREDICTIONS_ATR):
        if os.path.exists(src):
            dst = os.path.join(file_layout.ATTIC, os.path.split(src)[1])
            os.rename(src, dst)

    # Make sure the signature was generated correctly.  An error could
    # mean that arrayplot.py or cluster is missing.
    assert os.path.exists(file_layout.PREDICTIONS_PNG), \
        "Failed to make predictions heatmap."
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Read the input matrix and rewrite it to outfile in tdf format.

    The file named by in_data.identifier is read with arrayio.read,
    converted with arrayio.convert to arrayio.tdf, and written to
    `outfile`.
    """
    import arrayio

    # NOTE: earlier revisions gunzipped the input and converted
    # xls/xlsx workbooks through xls2txt before reading.  That code was
    # already disabled (commented out) and is no longer carried here.
    in_filename = in_data.identifier

    to_format = arrayio.tdf
    MATRIX = arrayio.read(in_filename)
    MATRIX_c = arrayio.convert(MATRIX, to_format=to_format)
    # The with-block closes (and therefore flushes) the output file;
    # previously the handle was left open.
    with open(outfile, 'w') as handle:
        to_format.write(MATRIX_c, handle)
def plot_line_keywds(filename, keywords, outfile):
    """Draw one line plot per keyword and stack them into one image.

    For each keyword, the rows whose second annotation equals the
    keyword are plotted (legend entries come from the first annotation)
    and saved as '<keyword>.png' in the current directory.  The
    per-keyword images are then pasted top to bottom onto a white
    background and saved to `outfile`.

    Raises AssertionError if a keyword matches no row or if the output
    file is missing or empty afterwards.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    # Fetch the annotations and the data once instead of once per row
    # and keyword.
    names = M.row_names(header[0])
    row_keywords = M.row_names(header[1])
    values = M.slice()
    nrow = M.dim()[0]

    outfiles = []
    for keyword in keywords:
        out = keyword + '.png'
        data = []
        legend_name = []
        for i in range(nrow):
            if row_keywords[i] == keyword:
                data.append(values[i])
                legend_name.append(names[i])
        # Report the keyword that was not found, not the whole list.
        assert len(data) > 0, 'cannot find the keywords %s in the file %s' % (
            keyword, filename)
        # Each line is a list of (x position, value) points.
        lines = [[(j, row[j]) for j in range(len(row))] for row in data]
        params = {
            "box_label": label,
            "legend": legend_name,
            "ylim_min": 0,
            "ylabel": keyword,
            "left": 0.1,
        }
        fig = mplgraph.lineplot(*lines, **params)
        fig.savefig(out)
        outfiles.append(out)

    import Image
    img_w_list = []
    img_h_list = []
    imgs = []
    for path in outfiles:
        img = Image.open(path, 'r')
        img_w, img_h = img.size
        img_w_list.append(img_w)
        img_h_list.append(img_h)
        imgs.append(img)

    # Leave a 30 pixel horizontal and a 10 pixel vertical margin.
    total_w = max(img_w_list) + 30
    total_h = sum(img_h_list) + 10
    background = Image.new('RGBA', (total_w, total_h), (255, 255, 255, 255))
    bg_w, bg_h = background.size
    offset_w = (bg_w - max(img_w_list)) // 2
    # Image i is placed so that it and all later images fill the space
    # down to the bottom edge.
    offset_h_list = [
        bg_h - sum(img_h_list[i:]) for i in range(len(img_h_list))]
    for img, offset_h in zip(imgs, offset_h_list):
        background.paste(img, (offset_w, offset_h))
    background.save(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywds fails'
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Plot the mean signal of the AFFX- control probes per sample.

    The platform of the input matrix is taken from the first entry
    returned by arrayplatformlib.identify_all_platforms_of_matrix.  For
    the listed Illumina and Entrez platforms a placeholder figure
    titled 'no AFFX plot can be generated' is saved.  Otherwise the
    rows whose ID starts with 'AFFX-' are averaged with
    jmath.mean_matrix and drawn as a line plot.

    Raises AssertionError if the output file is missing or empty
    afterwards; this includes the case where no platform was
    identified, since nothing is plotted then.
    """
    from genomicode import mplgraph
    import arrayio
    from genomicode import jmath
    from genomicode import arrayplatformlib
    from genomicode import filelib

    in_data = antecedents
    M = arrayio.read(in_data.identifier)
    platforms = arrayplatformlib.identify_all_platforms_of_matrix(M)
    id_ = platforms[0][0]
    platform = platforms[0][1]
    if platform:
        if platform in [
                'HumanHT_12', 'MouseRef_8', 'HumanHT_12_control',
                'MouseRef_8_control', 'entrez_ID_human', 'entrez_ID_mouse',
                'entrez_symbol_human', 'entrez_symbol_mouse'
        ]:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.plot([0, 0, 0, 0])
            plt.title('no AFFX plot can be generated')
            plt.savefig(outfile)
        else:
            # M is reused here; it was previously read from disk a
            # second time although nothing had modified it.
            label = M._col_names['_SAMPLE_NAME']
            row_names = M._row_names[id_]
            index = [
                i for i, name in enumerate(row_names)
                if name.startswith('AFFX-')]
            M_new = M.matrix(index)
            new = M_new.slice()
            a = jmath.mean_matrix(new, byrow=None)
            line = [(i, a[i]) for i in range(len(a))]
            f = mplgraph.lineplot(line,
                                  ylim_min=0,
                                  ylabel='Gene Expression Value',
                                  box_label=label)
            f.savefig(outfile)
    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_affy_affx_line fails' % outfile)
def convert_to_same_platform(filename1, filename2, platform=None):
    """Return two file names whose matrices are on the same platform.

    If both files are already on the same platform they are returned
    unchanged.  Otherwise the file that is not on `platform` is
    converted with the script configured as config.annotate_matrix,
    which writes its output to 'tmp' in the current directory, and
    'tmp' takes that file's place in the returned pair.

    Raises AssertionError if the annotation script cannot be found, if
    neither file is on the target platform, or if 'tmp' is missing or
    empty afterwards.  Raises ValueError with the script's stderr text
    if it wrote anything to stderr.
    """
    import arrayio
    import subprocess
    from genomicode import config
    from genomicode import arrayplatformlib
    from genomicode import filelib

    M1 = arrayio.read(filename1)
    platform1 = arrayplatformlib.identify_platform_of_matrix(M1)
    M2 = arrayio.read(filename2)
    platform2 = arrayplatformlib.identify_platform_of_matrix(M2)
    if platform1 == platform2:
        return filename1, filename2

    Annot_path = config.annotate_matrix
    Annot_BIN = filelib.which(Annot_path)
    assert Annot_BIN, 'cannot find the %s' % Annot_path

    if platform1 == platform:
        filename = filename2
        newfilename1 = filename1
        newfilename2 = 'tmp'
    elif platform2 == platform:
        filename = filename1
        newfilename1 = 'tmp'
        newfilename2 = filename2
    else:
        # Previously this case fell through and failed later with an
        # unbound local variable; report it clearly instead.
        raise AssertionError(
            'neither file is on the target platform %r (found %r and %r)'
            % (platform, platform1, platform2))

    if platform:
        command = [
            'python', Annot_BIN, '-f', filename, '-o', 'tmp', "--platform",
            platform
        ]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # Any output on stderr is treated as a failure.
        error_message = process.communicate()[1]
        if error_message:
            raise ValueError(error_message)
    # NOTE(review): with a falsy platform no conversion is run, so this
    # check only passes if a 'tmp' file already exists -- confirm
    # whether that case should be rejected outright.
    assert filelib.exists_nz('tmp'), 'the platform conversion fails'
    return newfilename1, newfilename2
def set_out_attributes(self, antecedents, out_attributes):
    """Return a copy of out_attributes with 'logged' set to 'yes' or 'no'.

    The value reflects whether binreg.is_logged_array_data considers
    the antecedent's matrix to be logged.
    """
    import arrayio
    from genomicode import binreg

    updated = out_attributes.copy()
    matrix = arrayio.read(antecedents.identifier)
    updated['logged'] = 'yes' if binreg.is_logged_array_data(matrix) else 'no'
    return updated
def summarize_filtered_genes(file_layout):
    """Write the highest-variance genes of the two data sets.

    Reads file_layout.DS_PROC and file_layout.DS_FINAL, which must have
    aligned rows.  The genes are chosen with pcalib.select_genes_var on
    the DS_PROC values, using NUM_FILTERED_GENES.  The same rows of
    both matrices are written in gct format to
    file_layout.DS_PROC_FILTERED and file_layout.DS_FINAL_FILTERED.

    Raises AssertionError, with a description of the mismatch, if the
    rows of the two matrices are not aligned.
    """
    # Select the <NUM_FILTERED_GENES> genes that vary most by variance.
    import arrayio
    from genomicode import matrixlib
    from genomicode import pcalib

    DATA_orig = arrayio.read(file_layout.DS_PROC)
    DATA_final = arrayio.read(file_layout.DS_FINAL)
    # The description is only built when the assertion fails.
    assert matrixlib.are_rows_aligned(DATA_orig, DATA_final), \
        matrixlib.describe_unaligned_rows(DATA_orig, DATA_final)

    # Select the genes with the greatest variance.
    I = pcalib.select_genes_var(DATA_orig._X, NUM_FILTERED_GENES)
    DATA_orig = DATA_orig.matrix(I, None)
    DATA_final = DATA_final.matrix(I, None)

    # The with-blocks close (and therefore flush) the output files;
    # previously the handles were left open.
    with open(file_layout.DS_PROC_FILTERED, 'w') as handle:
        arrayio.gct_format.write(DATA_orig, handle)
    with open(file_layout.DS_FINAL_FILTERED, 'w') as handle:
        arrayio.gct_format.write(DATA_final, handle)
def set_out_attributes(self, antecedents, out_attributes):
    """Return a copy of out_attributes with 'gene_normalize' filled in.

    The value is 'variance' if is_gene_normalize_variance accepts the
    antecedent's matrix, otherwise 'sum_of_squares' if
    is_gene_normalize_ss accepts it, otherwise 'no'.
    """
    import arrayio

    updated = out_attributes.copy()
    matrix = arrayio.read(antecedents.identifier)

    # Checked in order; the first predicate that matches wins.
    checks = (
        (is_gene_normalize_variance, 'variance'),
        (is_gene_normalize_ss, 'sum_of_squares'),
    )
    detected = 'no'
    for predicate, value in checks:
        if predicate(matrix):
            detected = value
            break
    updated['gene_normalize'] = detected
    return updated
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Keep the rows whose probe pair is a best match on both platforms.

    The mapping file configured as config.HumanHT_12_to_HG_u133_Plus_2
    is filtered to entries with Distance <= 1000 and Match equal to
    'Best for Both'.  A row of the input matrix is kept when its
    (Affymetrix probe set ID, Illumina probe ID) pair is among those
    entries.  The kept rows are written to `outfile` in tab-delimited
    format.

    Returns None in every case.  Nothing is written when the matrix
    lacks either the 'HumanHT_12' or the 'HG_U133_Plus_2' annotation,
    or when no row matches.

    Raises AssertionError if the mapping file does not exist or if the
    output file is missing or empty after writing.
    """
    from genomicode import filelib
    import os
    import arrayio
    from genomicode import config
    from genomicode import arrayplatformlib

    in_data = antecedents
    mapfile = config.HumanHT_12_to_HG_u133_Plus_2
    assert os.path.exists(mapfile), 'mapping file %s does not exist' % mapfile

    # Collect the accepted pairs in a set so that each row lookup below
    # is constant-time.
    best_pairs = set()
    for d in filelib.read_row(mapfile, header=True):
        if int(d.Distance) <= 1000 and d.Match == 'Best for Both':
            best_pairs.add((d.Affymetrix_Probe_Set_ID, d.Illumina_Probe_ID))

    M = arrayio.read(in_data.identifier)
    #platform_list = arrayplatformlib.identify_all_platforms_of_matrix(M)
    platform_list = arrayplatformlib.score_all_platforms_of_matrix(M)
    illu_id = None
    probe_id = None
    for platform in platform_list:
        # platform[0] is used as the key into the matrix's row names.
        if 'HumanHT_12' in platform:
            illu_id = M._row_names[platform[0]]
        if 'HG_U133_Plus_2' in platform:
            probe_id = M._row_names[platform[0]]
    if not illu_id or not probe_id:
        return None

    index = [
        i for i in range(M.nrow())
        if (probe_id[i], illu_id[i]) in best_pairs]
    if not index:
        return None

    M_new = M.matrix(index, None)
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(M_new, f)
    assert filelib.exists_nz(outfile), (
        'the output file %s for best_match_both fails' % outfile
    )
def set_out_attributes(self, antecedents, out_attributes):
    """Return a copy of out_attributes with 'gene_center' filled in.

    The value is 'mean' if is_gene_center_mean accepts the antecedent's
    matrix, otherwise 'median' if is_gene_center_median accepts it,
    otherwise 'no'.
    """
    import arrayio

    updated = out_attributes.copy()
    matrix = arrayio.read(antecedents.identifier)

    # Checked in order; the first predicate that matches wins.
    checks = (
        (is_gene_center_mean, 'mean'),
        (is_gene_center_median, 'median'),
    )
    detected = 'no'
    for predicate, value in checks:
        if predicate(matrix):
            detected = value
            break
    updated['gene_center'] = detected
    return updated
def is_missing(identifier):
    """Return True if the matrix read from `identifier` contains a None.

    Cells are scanned row by row and the scan stops at the first
    missing value.
    """
    import arrayio

    matrix = arrayio.read(identifier)
    for row_index in range(matrix.dim()[0]):
        cells = matrix._X[row_index]
        if any(cells[col] is None for col in range(matrix.dim()[1])):
            return True
    return False