def _res_to_gct(X):
    # Will lose the CALL information.
    # Will lose the SCALE_FACTOR.
    # Will lose the column DESCRIPTIONS.
    from genomicode import Matrix

    assert res_format.is_matrix(X)

    # Figure out the annotation names for the row name and description.
    acc_header, desc_header, call_header = X.row_names()
    if X._synonyms[ROW_ID] != acc_header:
        acc_header, desc_header = desc_header, acc_header
    assert X._synonyms[ROW_ID] == acc_header

    row_order = ["Name", "Description"]
    col_order = [X._col_order[0]]
    row_names = {}
    col_names = {}
    synonyms = {}

    row_names["Name"] = X.row_names(acc_header)
    row_names["Description"] = X.row_names(desc_header)
    col_names[col_order[0]] = X.col_names(col_order[0])
    synonyms[ROW_ID] = "Name"
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    #gct_format.is_matrix(x); print gct_format.DIAGNOSIS
    assert gct_format.is_matrix(x)
    return x
def merge_two_files(A_file, B_file, handle):
    """Read two matrix files, merge them on their common genes, and
    write the merged matrix to handle."""
    import arrayio
    from genomicode import Matrix
    from genomicode import matrixlib

    M_A = arrayio.read(A_file)
    M_B = arrayio.read(B_file)
    assert arrayio.tab_delimited_format.is_matrix(M_A)
    assert arrayio.tab_delimited_format.is_matrix(M_B)

    [M_A, M_B] = matrixlib.align_rows(M_A, M_B)
    assert M_A.nrow() > 0, 'there are no common genes between the two files'

    X = []
    for i in range(M_A.dim()[0]):
        x = M_A._X[i] + M_B._X[i]
        X.append(x)

    row_names = M_A._row_names
    row_order = M_A._row_order
    col_names = {}
    for name in M_A._col_names:
        if name not in M_B._col_names:
            continue
        # Rename samples from B that collide with samples from A.
        newsample_list = []
        for sample in M_B._col_names[name]:
            if sample in M_A._col_names[name]:
                newsample = sample + '_2'
            else:
                newsample = sample
            newsample_list.append(newsample)
        #x = M_A._col_names[name] + M_B._col_names[name]
        x = M_A._col_names[name] + newsample_list
        col_names[name] = x

    M_c = Matrix.InMemoryMatrix(X, row_names, col_names, row_order)
    arrayio.tab_delimited_format.write(M_c, handle)
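
# Usage sketch (not part of the original code): how merge_two_files might be
# called.  The file names here are hypothetical placeholders; both inputs just
# need to be readable by arrayio.
def _example_merge_two_files():
    handle = open("merged_expression.txt", 'w')
    merge_two_files("expression_A.txt", "expression_B.txt", handle)
    handle.close()
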
def _jeff_to_pcl(X):
    # Will lose a lot of annotations.
    from genomicode import Matrix

    assert jeffs_format.is_matrix(X)
    assert len(X.col_names()) == 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME

    row_order = ["Probe.Set.ID", "NAME"]
    col_order = X._col_order[:]
    row_names = {}
    col_names = X._col_names.copy()
    synonyms = {}

    row_names["Probe.Set.ID"] = X._row_names["Probe.Set.ID"]
    row_names["NAME"] = X._row_names["Gene.Symbol"]
    synonyms[ROW_ID] = "Probe.Set.ID"
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    assert pcl_format.is_matrix(x)
    return x
def _gct_to_pcl(X):
    from genomicode import Matrix

    assert gct_format.is_matrix(X)
    assert len(X.col_names()) == 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME
    assert len(X.row_names()) == 2
    name, desc = X.row_names()

    row_order = ["GeneID", "NAME"]
    col_order = X._col_order[:]
    row_names = {}
    col_names = X._col_names.copy()
    synonyms = {}

    row_names["GeneID"] = X._row_names[name]
    row_names["NAME"] = X._row_names[desc]
    synonyms[ROW_ID] = "GeneID"
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    assert pcl_format.is_matrix(x)
    return x
def read_geneset_scores(filename):
    # Read the output from score_geneset.py and return a Matrix
    # object.
    import os
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import Matrix
    from arrayio import const
    from arrayio import tab_delimited_format as tdf

    assert os.path.exists(filename)
    matrix = [x for x in filelib.read_cols(filename)]
    matrix = jmath.transpose(matrix)

    # Only want the scores.  Get rid of the direction, pvalue, and
    # significance lines.
    # Columns:
    # SAMPLE
    # FILE
    # [Score ...]
    # [Direction ...]     " direction"
    # [p value ...]       " pvalue"
    # [significant ...]   " significant"
    assert matrix
    i = 0
    while i < len(matrix):
        assert matrix[i]
        metadata = False
        if matrix[i][0].endswith(" direction"):
            metadata = True
        elif matrix[i][0].endswith(" pvalue"):
            metadata = True
        elif matrix[i][0].endswith(" significant"):
            metadata = True
        if not metadata:
            i += 1
            continue
        del matrix[i]

    # BUG: Need more checks on size and format of matrix.
    col_names = {}
    sample_row = 0
    if matrix[1][0].upper() == "SAMPLE":
        sample_row = 1
    col_names[tdf.SAMPLE_NAME] = matrix[sample_row][1:]

    row_names = {}
    row_names['geneset'] = []
    synonyms = {}
    synonyms[const.COL_ID] = tdf.SAMPLE_NAME

    data = []
    for line in matrix[2:]:
        single_data = [jmath.safe_float(i) for i in line[1:]]
        data.append(single_data)
        row_names['geneset'].append(line[0])

    M = Matrix.InMemoryMatrix(
        data, row_names=row_names, col_names=col_names, synonyms=synonyms)
    return M
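
# Usage sketch (not part of the original code).  Assumes "geneset_scores.txt"
# was produced by score_geneset.py; shows how the returned Matrix might be
# inspected.
def _example_read_geneset_scores():
    from arrayio import tab_delimited_format as tdf
    M = read_geneset_scores("geneset_scores.txt")
    print M.row_names('geneset')        # gene set names, one per row
    print M.col_names(tdf.SAMPLE_NAME)  # sample names, one per column
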
def _tdf_to_gct(X):
    from genomicode import Matrix
    from genomicode import parselib

    assert tab_delimited_format.is_matrix(X)
    assert len(X.col_names()) >= 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME

    name_header = "NAME"
    desc_header = "DESCRIPTION"

    # Make up default names.
    name = ["NAME%s" % x for x in parselib.pretty_range(0, X.nrow())]
    desc = ["DESC%s" % x for x in parselib.pretty_range(0, X.nrow())]

    # Try to find better names.
    if not X.row_names():
        pass
    elif len(X.row_names()) == 1:
        # Only 1 header, so use that for the name.
        name = X.row_names(X.row_names()[0])
    else:
        # Use the first two columns for the name and description.
        name_i, desc_i = 0, 1
        # See if there is a ROW_ID set.  If there is, use that for NAME.
        if ROW_ID in X._synonyms:
            name_i = X.row_names().index(X._synonyms[ROW_ID])
            if name_i == desc_i:
                # name_i used to be 0, and desc_i is not 0.
                assert desc_i != 0
                desc_i = 0
        assert name_i != desc_i
        name = X.row_names(X.row_names()[name_i])
        desc = X.row_names(X.row_names()[desc_i])

    row_order = [name_header, desc_header]
    col_order = [X._col_order[0]]
    row_names = {}
    col_names = {}
    synonyms = {}

    row_names[name_header] = name
    row_names[desc_header] = desc
    col_names[tab_delimited_format.SAMPLE_NAME] = X._col_names[X._col_order[0]]
    synonyms[ROW_ID] = name_header
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    assert gct_format.is_matrix(x)
    return x
def _convert_to_pcl(MATRIX):
    # Convert the matrix to PCL format.
    # Row names   <ID>  NAME
    # Col names
    import arrayio
    from genomicode import Matrix

    # Select from the row names an ID and a NAME.
    id_name = _choose_gene_id(MATRIX)
    name_name = _choose_gene_label(MATRIX)

    # Make sure there aren't any blank gene IDs, or cluster will
    # complain.  Also, make sure they are unique.
    seen = {}
    for id_ in MATRIX.row_names(id_name):
        id_ = id_.strip()
        assert id_, "Missing gene IDs (header %s)." % id_name
        assert id_ not in seen, "Duplicate gene ID %s." % id_
        seen[id_] = 1

    # Should not use "GID" as column name for PCL file.  When
    # clustering, cluster will add another "GID" column, and then
    # there will be two columns called "GID".  Rename this to
    # something else, if necessary.
    pretty_id_name = id_name
    if pretty_id_name == "GID":
        pretty_id_name = "GID.OLD"
    if pretty_id_name == "NAME":
        # GCT files use "NAME" for the ID, which conflicts with the PCL
        # definition.
        pretty_id_name = "ID.NAME"
    pretty_name_name = "NAME"

    SAMPLE_NAME = arrayio.tab_delimited_format.SAMPLE_NAME
    row_order = [pretty_id_name, pretty_name_name]
    col_order = [SAMPLE_NAME]
    row_names = {}
    col_names = {}
    synonyms = {}

    row_names[pretty_id_name] = MATRIX.row_names(id_name)
    row_names[pretty_name_name] = MATRIX.row_names(name_name)
    col_names[SAMPLE_NAME] = MATRIX.col_names(arrayio.COL_ID)
    synonyms[arrayio.ROW_ID] = pretty_id_name
    synonyms[arrayio.COL_ID] = SAMPLE_NAME

    pcl_matrix = Matrix.InMemoryMatrix(
        MATRIX.slice(), row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #pcl_matrix = Matrix.add_synonyms(x, synonyms)
    assert arrayio.pcl_format.is_matrix(pcl_matrix)
    return pcl_matrix
def format_rsem_isoforms(txt_file, outfile):
    import arrayio
    from genomicode import arrayannot
    from genomicode import arrayplatformlib
    from genomicode import Matrix

    M = arrayio.read(txt_file)

    # Detect the platform.
    x = arrayplatformlib.score_matrix(M, min_score=0.8)
    assert x, "Cannot identify platform."
    header, platform = x.header, x.platform_name
    probe_ids = M.row_names(header)

    # If kg5, convert to kg7.
    if platform == 'UCSC_human_hg19_kg5':
        new_platform = 'UCSC_human_hg19_kg7'
        kg7_ids = arrayannot.convert_probe_ids(probe_ids, new_platform)
        kg7_header = 'Hybridization REF kg7'
        M = make_matrix_new_ids(M, kg7_ids, kg7_header, 1)
        probe_ids = M.row_names(kg7_header)

    # Add LocusLink ids and gene symbols.
    LocusLink_ids = arrayannot.convert_probe_ids(probe_ids, 'Entrez_ID_human')
    gene_symbol_ids = arrayannot.convert_probe_ids(
        probe_ids, 'Entrez_Symbol_human')
    newMatrix = make_matrix_new_ids(M, LocusLink_ids, 'Entrez_ID_human', 2)
    newMatrix = make_matrix_new_ids(
        newMatrix, gene_symbol_ids, 'Entrez_Symbol_human', 3)

    # Get rid of the scaled_estimate columns (keep every other column).
    assert 'scaled_estimate' in newMatrix._col_names['isoform_id']
    assert 'raw_count' in newMatrix._col_names['isoform_id']
    col_names = {}
    col_names['_SAMPLE_NAME'] = [
        newMatrix._col_names['_SAMPLE_NAME'][i]
        for i in range(len(newMatrix._col_names['_SAMPLE_NAME']))
        if not i % 2]
    row_names = newMatrix._row_names.copy()
    row_order = newMatrix._row_order[:]
    col_order = newMatrix._col_order[:]
    col_order.remove('isoform_id')
    synonyms = newMatrix._synonyms.copy()

    X = []
    for line in newMatrix._X:
        line = [line[i] for i in range(len(line)) if not i % 2]
        X.append(line)

    x = Matrix.InMemoryMatrix(
        X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    f = file(outfile, 'w')
    arrayio.tab_delimited_format.write(x, f)
    f.close()
def _pcl_to_gct(X):
    # Will lose the column annotations.
    from genomicode import Matrix
    from genomicode import parselib

    assert pcl_format.is_matrix(X)
    # PCL format can have multiple column annotations, e.g. EWEIGHT.
    #assert len(X.col_names()) == 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME
    assert len(X.row_names()) > 0

    # Figure out the annotation names for the row name and description.
    if len(X.row_names()) == 1:
        row_name, row_desc = X.row_names()[0], None
    else:
        row_name, row_desc = X.row_names()[:2]

    row_order = ["NAME", "DESCRIPTION"]
    #col_order = X._col_order[:]
    col_order = [tab_delimited_format.SAMPLE_NAME]
    row_names = {}
    col_names = {}
    synonyms = {}

    row_names["NAME"] = X._row_names[row_name]
    if row_desc:
        row_names["DESCRIPTION"] = X._row_names[row_desc]
    else:
        # Make up default row names.
        x = ["DESC%s" % x for x in parselib.pretty_range(0, X.nrow())]
        row_names["DESCRIPTION"] = x
    col_names[col_order[0]] = X._col_names[tab_delimited_format.SAMPLE_NAME]
    synonyms[ROW_ID] = "NAME"
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    #gct_format.is_matrix(x)
    #print gct_format.DIAGNOSIS
    assert gct_format.is_matrix(x)
    return x
def make_matrix_new_ids(DATA, output_ids, header, index):
    # Return a copy of DATA with output_ids inserted as a new row
    # annotation named header, at position index of the row order.
    from genomicode import Matrix

    X = DATA._X
    row_names = DATA._row_names.copy()
    row_order = DATA._row_order[:]
    col_names = DATA._col_names.copy()
    col_order = DATA._col_order[:]
    synonyms = DATA._synonyms.copy()
    row_order.insert(index, header)
    row_names[header] = output_ids

    x = Matrix.InMemoryMatrix(
        X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    return x
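
# Usage sketch (not part of the original code): insert a hypothetical
# "Gene.Symbol" annotation as the second row header of an existing Matrix.
def _example_make_matrix_new_ids(DATA, gene_symbols):
    # gene_symbols should have one entry per row of DATA.
    assert len(gene_symbols) == DATA.nrow()
    return make_matrix_new_ids(DATA, gene_symbols, "Gene.Symbol", 1)
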
def _res_to_pcl(X):
    # Will lose the CALL information.
    # Will lose the SCALE_FACTOR.
    # Will lose the column DESCRIPTIONS.
    from genomicode import Matrix

    assert res_format.is_matrix(X)

    # Figure out the annotation names for the row name and description.
    acc_header, desc_header, call_header = X.row_names()
    if X._synonyms[ROW_ID] != acc_header:
        acc_header, desc_header = desc_header, acc_header
    assert X._synonyms[ROW_ID] == acc_header

    # Make sure the names don't conflict.
    row_name = acc_header
    if row_name == "NAME":
        row_name = "ORIGINAL_NAME"

    row_order = [row_name, "NAME"]
    col_order = [X._col_order[0]]
    row_names = {}
    col_names = {}
    synonyms = {}

    row_names[row_name] = X.row_names(acc_header)
    row_names["NAME"] = X.row_names(desc_header)
    col_names[col_order[0]] = X.col_names(col_order[0])
    synonyms[ROW_ID] = row_name
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    #pcl_format.is_matrix(x); print pcl_format.DIAGNOSIS
    assert pcl_format.is_matrix(x)
    return x
def _res_to_tdf(X):
    from genomicode import Matrix

    assert res_format.is_matrix(X)

    # Figure out the annotation names for the row name and description.
    acc_header, desc_header, call_header = X.row_names()
    if X._synonyms[ROW_ID] != acc_header:
        acc_header, desc_header = desc_header, acc_header
    assert X._synonyms[ROW_ID] == acc_header

    row_order = [acc_header, desc_header, call_header]
    col_order = X._col_order
    row_names = X._row_names
    col_names = X._col_names
    synonyms = X._synonyms

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    assert tab_delimited_format.is_matrix(x)
    return x
def _tdf_to_pcl(X):
    from genomicode import Matrix
    from genomicode import parselib

    assert tab_delimited_format.is_matrix(X)
    assert len(X.col_names()) >= 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME

    # Make up default headers and names.
    id_header = "GENE_ID"
    name_header = "NAME"
    geneid = ["GENE%s" % x for x in parselib.pretty_range(0, X.nrow())]
    name = None

    # Try to find better names.
    if not X.row_names():
        pass
    elif len(X.row_names()) == 1:
        # Only 1 header, so use that for the gene ID.
        id_header = X.row_names()[0]
        geneid = X.row_names(id_header)
    else:
        # Use the first two columns for the ID and name.
        geneid_i, name_i = 0, 1
        # See if there is a ROW_ID set.  If there is, use that for the
        # gene ID.
        if ROW_ID in X._synonyms:
            id_header = X._synonyms[ROW_ID]
            geneid_i = X.row_names().index(id_header)
            if geneid_i == name_i:
                # geneid_i used to be 0, and name_i is not 0.
                name_i = 0
        assert geneid_i != name_i
        geneid = X.row_names(X.row_names()[geneid_i])
        name = X.row_names(X.row_names()[name_i])

    if id_header == name_header:
        id_header = "GENE_ID"   # assume this is not the name_header
    assert id_header != name_header

    row_order = [id_header]
    if name is not None:
        row_order = [id_header, name_header]
    col_order = [X._col_order[0]]
    row_names = {}
    col_names = {}
    synonyms = {}

    row_names[id_header] = geneid
    if name_header in row_order:
        row_names[name_header] = name
    col_names[tab_delimited_format.SAMPLE_NAME] = X._col_names[X._col_order[0]]
    synonyms[ROW_ID] = id_header
    synonyms[COL_ID] = col_order[0]

    x = Matrix.InMemoryMatrix(
        X._X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #x = Matrix.add_synonyms(x, synonyms)
    assert pcl_format.is_matrix(x)
    return x
def summarize_factor_scores(file_layout, factor_cutoff, python, arrayplot,
                            cluster, libpath):
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    DATA = arrayio.read(file_layout.DATASET)
    model = _read_model(file_layout, factor_cutoff)
    F = model["F"]

    # If there were no factors, then don't generate any files.
    if not F.nrow():
        print "Not generating factor scores file. No factors detected."
        return
    assert F.ncol() == DATA.ncol()

    # Read the factor names.
    x = [x.strip() for x in open(file_layout.BFRM_FACTOR_IDS)]
    factor_names = x
    assert len(factor_names) == F.nrow()
    # The factor names are in the same order as the data files.  Sort
    # them so they'll be in the same order as the clean model.
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F._X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    x = graphlib.find_wide_heatmap_size(
        M.nrow(), M.ncol(), min_box_height=10, min_box_width=10,
        max_total_height=768, max_total_width=1024)
    xpix, ypix = x
    ypix = min(ypix, xpix * 4)
    # TODO: Don't show array label if there are too many samples.
    x = graphlib.plot_heatmap(
        file_layout.FACTOR_SCORES, file_layout.FACTOR_SCORES_PNG, xpix, ypix,
        color="bild", show_colorbar=True, show_grid=True,
        gene_label=True, cluster_genes=True,
        gene_center="mean", gene_normalize="var",
        array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster, libpath=libpath)

    # Clean up some of the cluster files.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
        ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        src = filename
        x = os.path.split(filename)[1]
        dst = os.path.join(file_layout.ATTIC, x)
        os.rename(src, dst)
def summarize_gene_factor_probs(file_layout, factor_cutoff, python, arrayplot,
                                cluster, libpath):
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    model = _read_model(file_layout, factor_cutoff)
    PostPib = model["PostPib"]
    ExternalProb = model.get("ExternalProb")

    # If there were no factors, then don't generate any files.
    if not PostPib.ncol():
        print "Not generating factor probabilities file. No factors detected."
        return

    # Pull out the gene names.
    DATA = arrayio.read(file_layout.DATASET)
    DATA_m = DATA.matrix(model["VariablesIn"], None)

    # Pull out the factor names.
    assert os.path.exists(file_layout.FACTOR_SCORES)
    D_scores = arrayio.read(file_layout.FACTOR_SCORES)
    factor_names = D_scores.row_names(arrayio.ROW_ID)
    assert len(factor_names) == PostPib.ncol()

    # Write the probabilities for the genes in the model.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_order = DATA_m.row_names()
    for x in row_order:
        row_names[x] = DATA_m.row_names(x)
    col_names[SAMPLE_NAME] = factor_names
    M = Matrix.InMemoryMatrix(PostPib._X, row_names, col_names, row_order)
    arrayio.tab_delimited_format.write(M, file_layout.FACTOR_PROBS)

    # Make heatmap of the factor probs.
    #x = graphlib.find_tall_heatmap_size(
    #    M.nrow(), M.ncol(), min_box_width=10, max_total_height=1000,
    #    max_total_width=1000)
    xpix, ypix = 20, 20
    x = graphlib.plot_heatmap(
        file_layout.FACTOR_PROBS, file_layout.FACTOR_PROBS_PNG, xpix, ypix,
        color="red",
        #show_colorbar=True,
        show_grid=True, array_label=True, gene_label=True,
        scale=-0.5, gain=2.0,
        python=python, arrayplot=arrayplot, cluster=cluster, libpath=libpath)

    # If exists, write the probabilities for all genes in the data set.
    if not ExternalProb:
        return
    row_names = {}
    col_names = {}
    row_order = DATA.row_names()
    for x in row_order:
        row_names[x] = DATA.row_names(x)
    col_names[SAMPLE_NAME] = factor_names
    M = Matrix.InMemoryMatrix(ExternalProb._X, row_names, col_names, row_order)
    arrayio.tab_delimited_format.write(M, file_layout.FACTOR_PROBS_ALL)
def summarize_factor_scores(file_layout, python, arrayplot, cluster, libpath):
    import zipfile
    import arrayio
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import archive
    from genomicode import graphlib
    from genomicode import bfrm

    DATA = arrayio.read(file_layout.DATASET)

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(
        file_layout.BFRM_MODEL, param_file=param_file)
    num_factors = model["F"].nrow()

    # Load the factor names.
    assert zipfile.is_zipfile(file_layout.BFRM_MODEL)
    s2f = archive.unzip_dict(file_layout.BFRM_MODEL)
    assert "factorids.txt" in s2f, "Missing: factorids.txt"
    zfile = zipfile.ZipFile(file_layout.BFRM_MODEL)
    factor_names = [x.strip() for x in zfile.open(s2f["factorids.txt"])]
    assert len(factor_names) == num_factors

    # sample x factor matrix
    F = arrayio.read(file_layout.BFRM_AF)
    assert F.nrow() == DATA.ncol()
    F_X = jmath.transpose(F._X)

    # F_X contains all factors, including intercept and design.
    # Remove all but the latent factors.
    F_X = F_X[-num_factors:]

    # Sort the factors so they'll be in the same order as the clean
    # model.
    assert len(F_X) == len(model["FACTOR_O"])
    F_X = [F_X[i] for i in model["FACTOR_O"]]
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    # Write out the projected factor scores.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F_X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    x = graphlib.find_wide_heatmap_size(
        M.nrow(), M.ncol(), min_box_height=10, min_box_width=10,
        max_total_height=768, max_total_width=1024)
    xpix, ypix = x
    ypix = min(ypix, xpix * 4)
    x = graphlib.plot_heatmap(
        file_layout.FACTOR_SCORES, file_layout.FACTOR_SCORES_PNG, xpix, ypix,
        color="bild", show_colorbar=True, show_grid=True,
        gene_center="mean", gene_normalize="var",
        gene_label=True, cluster_genes=True,
        array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster, libpath=libpath)

    # Clean up the cluster files.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
        ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        src = filename
        x = os.path.split(filename)[1]
        dst = os.path.join(file_layout.ATTIC, x)
        os.rename(src, dst)
def convert_matrix(filename, header, header_and_platform, in_delim, out_delim,
                   keep_dups, keep_emptys, no_na, out_platforms,
                   min_match_score, debug):
    import sys
    import arrayio
    from genomicode import Matrix
    from genomicode import arrayplatformlib as apl
    from genomicode import arrayannot

    MIN_SCORE = 0.80
    REMOVE_VERSION = True

    assert not (header and header_and_platform)

    DATA = arrayio.read(filename)

    if header:
        x = DATA.row_names(header)
        gene_ids = apl.normalize_ids(
            x, delimiter=in_delim, remove_version_number=REMOVE_VERSION)
        x = apl.score_annotations(gene_ids, min_score=0.5)
        assert x, "I could not identify the platform for %s." % header
        best_score = x[0]
        in_platform, score = best_score.platform_name, best_score.max_score
    elif header_and_platform:
        x = header_and_platform.split(",", 1)
        assert len(x) == 2
        header, in_platform = x
        score = 1.0
        x = DATA.row_names(header)
        gene_ids = apl.normalize_ids(
            x, delimiter=in_delim, remove_version_number=REMOVE_VERSION)
        assert apl.find_platform_by_name(in_platform), \
               "Unknown platform: %s" % in_platform
    else:
        # Take the platform with the highest match score.
        scores = apl.score_matrix(
            DATA, annot_delim=in_delim, min_score=None,
            remove_version=REMOVE_VERSION)
        best_score = 0
        if scores:
            best_score = scores[0].max_score

        if best_score < MIN_SCORE and debug and scores:
            header = ("Header", "Platform", "Score", "Matrix Only",
                      "Plat Only", "Shared", "Matrix Only", "Plat Only",
                      "Shared")
            print "\t".join(header)
            for s in scores:
                x1 = sorted(s.mine_only)[:3]
                x2 = sorted(s.platform_only)[:3]
                x3 = sorted(s.shared)[:3]
                x1 = ", ".join(x1)
                x2 = ", ".join(x2)
                x3 = ", ".join(x3)
                x = (s.header, s.platform_name, s.max_score,
                     len(s.mine_only), len(s.platform_only), len(s.shared),
                     x1, x2, x3)
                assert len(x) == len(header)
                print "\t".join(map(str, x))
        assert best_score >= MIN_SCORE, "No platforms found"

        best_score = scores[0]
        header = best_score.header
        in_platform = best_score.platform_name
        score = best_score = best_score.max_score
        err = "Could not identify the platform with enough confidence.  " \
              "The best match was %s (%g)." % (in_platform, score)
        assert score >= min_match_score, err
        gene_ids = DATA.row_names(header)

    # Convert each of the platforms.
    output_ids_list = []
    for out_platform in out_platforms:
        x = arrayannot.convert_gene_ids(
            gene_ids, in_platform, out_platform, in_delim, out_delim,
            keep_dups, keep_emptys, no_na)
        output_ids_list.append(x)

    # Make a matrix with the new IDs.
    X = DATA._X
    row_names = DATA._row_names.copy()
    row_order = DATA._row_order[:]
    col_names = DATA._col_names.copy()
    col_order = DATA._col_order[:]
    synonyms = DATA._synonyms.copy()
    for (out_platform, output_ids) in zip(out_platforms, output_ids_list):
        header = out_platform
        i = 1
        while header in row_order:
            header = "%s_%d" % (out_platform, i)
            i += 1
        row_order.append(header)
        row_names[header] = output_ids

    # Write the outfile.
    x = Matrix.InMemoryMatrix(
        X, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    arrayio.tab_delimited_format.write(x, sys.stdout)
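
# Usage sketch (not part of the original code): convert the row IDs of a
# hypothetical expression matrix to gene symbols and write the annotated
# matrix to stdout.  The flag values below are assumptions for illustration.
def _example_convert_matrix():
    convert_matrix(
        "expression.txt",                       # input matrix (hypothetical)
        header=None, header_and_platform=None,  # auto-detect the platform
        in_delim=None, out_delim=None,
        keep_dups=False, keep_emptys=False, no_na=False,
        out_platforms=["Entrez_Symbol_human"],
        min_match_score=0.80, debug=False)
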
def read(handle, hrows=None, hcols=None, datatype=float):
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import Matrix
    import tab_delimited_format as tdf
    import const

    handle = filelib.openfh(handle)

    # Can't use iolib.split_tdf here because it does not handle empty
    # lines properly (which can occur if there is a file with no
    # samples).
    #data = iolib.split_tdf(handle.read())
    data = [x.rstrip("\r\n").split("\t") for x in handle]
    assert len(data) >= 3, "Invalid RES file."

    # Do some checking on the format.
    assert len(data[0]) == len(data[1]) + 1
    x = sorted([x.upper() for x in data[0][:2]])
    assert x == ["ACCESSION", "DESCRIPTION"]
    assert len(data[2]) == 1, "%d: %s" % (len(data[2]), repr(data[2]))

    # Parse out the number of genes and delete the row.
    num_genes = int(data[2][0])
    del data[2]
    assert len(data) == num_genes + 2   # data + 2 headers

    # GenePattern creates files where the last column is all blank.
    # If this is the case, then delete it.
    #blank_last_col = True
    x = [x[-1] for x in data if x[-1]]
    if not x:
        # Last column is all blank, so delete it.
        data = [x[:-1] for x in data]

    # Parse the names of the samples.
    sample_names = []
    for i, x in enumerate(data[0][2:]):
        if i % 2:
            assert not x
        else:
            assert x
            sample_names.append(x)

    # Parse out the sample_description.
    sample_description = []
    for i, x in enumerate(data[1]):
        if i % 2 == 0:
            assert not x
        else:
            assert x
            sample_description.append(x)
    assert len(sample_description) == len(sample_names)

    # Pull the scale factors out of the sample_description.
    # Some of the descriptions can be missing scale factors.
    scale_factors = [""] * len(sample_description)
    for i in range(len(sample_description)):
        x = sample_description[i]
        sf = "scale factor"
        j = x.lower().find(sf)
        if j < 0:
            continue
        assert x[j - 1] == "/"
        assert x[j + len(sf)] == "="
        scale_factors[i] = float(sample_description[i][j + len(sf) + 1:])
        sample_description[i] = sample_description[i][:j - 1]

    # Parse out the description and accession columns.
    accession_header = data[0][0]
    description_header = data[0][1]
    accession = [x[0] for x in data[2:]]
    description = [x[1] for x in data[2:]]
    x = [x.upper() for x in data[0][:2]]
    if x == ["DESCRIPTION", "ACCESSION"]:
        accession_header, description_header = \
            description_header, accession_header
        accession, description = description, accession
    assert (accession_header.upper(), description_header.upper()) == \
           ("ACCESSION", "DESCRIPTION")

    # Accession should be unique.
    x = {}.fromkeys(accession).keys()
    assert len(x) == len(accession)

    # Parse out the matrix and calls.
    matrix = []
    calls = []
    for row in data[2:]:
        row = row[2:]
        x0 = [x for (i, x) in enumerate(row) if i % 2 == 0]
        x1 = [x for (i, x) in enumerate(row) if i % 2 == 1]
        assert len(x0) == len(x1)
        for x in x1:
            assert x.upper() in ["A", "P", "M"], x
        matrix.append(x0)
        calls.append(x1)
    assert len(matrix) == num_genes

    # Should have some way of specifying no conversion.
    if datatype is None:
        convert_fn = None   # default
    elif datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    else:
        convert_fn = datatype
    if convert_fn:
        matrix = [map(convert_fn, x) for x in matrix]

    row_names = {}
    col_names = {}
    row_order = data[0][:2] + ["CALL"]
    col_order = [tdf.SAMPLE_NAME, "DESCRIPTION", "SCALE_FACTOR"]

    row_names[accession_header] = accession
    row_names[description_header] = description
    # Store the calls as row annotations.  The gene annotation "CALL"
    # is a string of A, P, or M, with one call per sample.
row_names["CALL"] = ["".join(x) for x in calls] col_names[tdf.SAMPLE_NAME] = sample_names col_names["DESCRIPTION"] = sample_description col_names["SCALE_FACTOR"] = scale_factors synonyms = {} synonyms[const.COL_ID] = tdf.SAMPLE_NAME synonyms[const.ROW_ID] = accession_header X = Matrix.InMemoryMatrix(matrix, row_names=row_names, col_names=col_names, row_order=row_order, col_order=col_order, synonyms=synonyms) #X = Matrix.add_synonyms(X, synonyms) #is_matrix(X); print DIAGNOSIS assert is_matrix(X) return X
def read(handle, hrows=None, hcols=None, datatype=float):
    import math
    from genomicode import filelib
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import iolib
    import util
    import const

    # Format:
    # - gene x experiment
    # - optional header row
    # - optional rows of sample annotations (requires header row)
    # - optional columns of gene annotations

    filename = None
    if type(handle) is type(""):
        filename = handle
    handle = filelib.openfh(handle)
    data = filelib.read_all_cols(handle)
    #data = [x for x in filelib.read_cols(handle)]
    #x = handle.read()
    #data = iolib.split_tdf(x, strip=True)
    #handle = filelib.read_cols(handle)
    #data = [handle.next() for i in range(100)]
    data = _clean_tdf(data)
    if not data:
        return Matrix.InMemoryMatrix([])

    num_cols = len(data[0])
    for i, x in enumerate(data):
        nc = len(data[i])
        f = ""
        if filename:
            f = " [%s]" % filename
        error_msg = "Header%s has %d columns but line %d has %d." % (
            f, num_cols, i + 1, nc)
        assert nc == num_cols, error_msg

    # If the rows and cols are not explicitly specified, then try to
    # guess them from the file.
    #print "HEADERS 1", hrows, hcols
    if hrows is None or hcols is None:
        hr, hc = util.num_headers(data)
        if hrows is None:
            hrows = hr
        if hcols is None:
            hcols = hc
    #print "HEADERS 2", hrows, hcols
    #num_genes, num_arrays = num_rows-hrows, num_cols-hcols

    # Pull out the row names from the columns.
    row_names = {}   # header -> list of names (1 for each gene)
    row_order = []   # in-order list of the headers
    if hcols:
        if hrows:
            # If a header row is provided, then the names of these
            # annotations are provided in the header.
            row_order = data[0][:hcols]
        else:
            # No header row.  Make default names for these annotations.
            ndigits = int(math.ceil(math.log(hcols, 10)))
            row_order = ["ANNOT%*d" % (ndigits, i + 1) for i in range(hcols)]
        # Strip extraneous whitespace from the header names.
        # Not necessary.  Handled now in split_tdf.
        #row_order = [x.strip() for x in row_order]

        # Sometimes the format detection can go wrong and a GCT file
        # will slip through to here.  If this occurs, a "duplicate
        # header" exception will be generated.  Check for this and
        # generate a more meaningful error message.
        if (row_order[0] == "#1.2" and len(row_order) > 1 and
            row_order[1] == "" and row_order[-1] == ""):
            raise AssertionError(
                "ERROR: It looks like a GCT file slipped through the "
                "format detection.")
        for i, header in enumerate(row_order):
            names = [x[i] for x in data[hrows:]]
            assert header not in row_names, "duplicate header: %s" % header
            row_names[header] = names

    # Pull out the column names.
    col_names = {}   # header -> list of names (1 for each array)
    col_order = []
    if hrows:
        for i in range(1, hrows):
            header = data[i][0]
            names = data[i][hcols:]
            assert header not in col_names, "duplicate name: %s" % header
            # Strip extraneous whitespace from the header names.
            # Not necessary.  Handled now in split_tdf.
            #header = header.strip()
            col_order.append(header)
            col_names[header] = names

    # Now extract the expression values.
    matrix = data
    if hrows or hcols:
        matrix = [x[hcols:] for x in matrix[hrows:]]

    # Pull out the sample names.
    sample_names = None
    if hrows:
        # If a header is provided, then use these as the column names.
        sample_names = data[0][hcols:]
    if sample_names:
        col_names[SAMPLE_NAME] = sample_names
        col_order.insert(0, SAMPLE_NAME)

    if datatype is None:
        convert_fn = None   # no conversion
    elif datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    else:
        # Assume that I was passed a function.
        convert_fn = datatype

    if convert_fn == jmath.safe_float:
        # Try and convert to an integer instead.
        is_int = True
        for i in range(len(matrix)):
            for j in range(len(matrix[i])):
                if not jmath.is_int(matrix[i][j]):
                    is_int = False
                    break
            if not is_int:
                break
        if is_int:
            convert_fn = jmath.safe_int

    if convert_fn:
        check_each_row = False
        try:
            matrix = [map(convert_fn, x) for x in matrix]
        except ValueError, err1:
            if str(err1) == "empty string for float()":
                check_each_row = True
            elif str(err1).startswith("invalid literal for float()"):
                check_each_row = True
            elif str(err1).startswith("could not convert string to float"):
                check_each_row = True
            else:
                raise
        if check_each_row:
            # If there was an exception, then check each row carefully
            # to try to pinpoint the problem.
            for i, x in enumerate(matrix):
                try:
                    map(convert_fn, x)
                except ValueError, err2:
                    row = data[hrows + i]
                    raise ValueError("%s\nProblem with row %d: %s" % (
                        str(err2), i + 1, row))
            raise AssertionError("Error converting values.")

    # Set ROW_ID and COL_ID to reasonable defaults.
    synonyms = {}
    if SAMPLE_NAME in col_names:
        synonyms[const.COL_ID] = SAMPLE_NAME
    if row_order:
        # Bug: This should be the first column with unique values.
        synonyms[const.ROW_ID] = row_order[0]

    X = Matrix.InMemoryMatrix(
        matrix, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #X = Matrix.add_synonyms(X, synonyms)
    return X
def read(handle, datatype=None):
    from genomicode import filefns
    from genomicode import Matrix
    import const

    # datatype is not used here.  It is explicitly specified in the
    # format.
    handle = filefns.openfh(handle)

    # Read the header.  The format description doesn't specify whether
    # the names are case sensitive, so accept case insensitive names
    # by converting everything to uppercase.
    x = handle.readline().strip()
    assert x == "ODF 1.0", "Missing ODF version."
    x = handle.readline().strip().split("=")
    assert len(x) == 2
    assert x[0].upper() == "HEADERLINES"
    header_lines = int(x[1])
    assert header_lines >= 3 and header_lines <= 7, \
           "Invalid number of header lines."
    lines = [handle.readline() for i in range(header_lines)]
    lines = [x for x in lines if x]   # remove blank lines if EOF
    assert len(lines) == header_lines, "Wrong number of lines in header."

    # Parse the header lines.
    header = {}       # name -> value
    num_cols = None   # just the data, not the annotation headers.
    for line in lines:
        delimiter = "="
        if line.startswith("COLUMN"):
            delimiter = ":"
        assert delimiter in line, "Header missing delimiter '%s': %s" % (
            delimiter, line)
        name, value = line.split(delimiter)
        name, value = name.strip(" \r\n"), value.strip(" \r\n")
        if name.startswith("COLUMN"):
            value = value.split("\t")
            num_data = len(value)
            if name in ["COLUMN_TYPES", "COLUMN_NAMES"]:
                # Contains metadata describing the annotations.
                num_data = len(value)-2
            if num_cols is None:
                num_cols = num_data
            assert num_data == num_cols
        name = name.upper()
        header[name] = value

    assert "MODEL" in header, 'Missing "Model" header.'
    assert "DATALINES" in header, 'Missing "DataLines" header.'
    assert "COLUMN_TYPES" in header, 'Missing "COLUMN_TYPES" header.'
    assert num_cols is not None   # Should come from COLUMN_TYPES.
    header["DATALINES"] = int(header["DATALINES"])
    assert header["DATALINES"] >= 0 and header["DATALINES"] < 1E6

    # Read the data block.
    lines = [handle.readline() for i in range(header["DATALINES"])]
    lines = [x for x in lines if not x.startswith("#")]   # no comments
    data = [x.rstrip("\r\n").split("\t") for x in lines]
    # There might be leftover information in the file.  The format
    # does not describe how to handle this case.

    # Parse the column names out of the header.
    col_types = header["COLUMN_TYPES"]                      # required
    col_names = header.get("COLUMN_NAMES")                  # optional
    col_descriptions = header.get("COLUMN_DESCRIPTIONS")    # optional

    # Make sure each line has the right number of columns.
    for x in data:
        assert len(x) == num_cols+2   # add the 2 columns of annotations

    # Convert the types of the data.
    for j in range(num_cols):
        coltype = col_types[j]
        ucoltype = coltype.upper()
        convert_fn = None
        if ucoltype == "STRING":
            pass
        elif ucoltype == "FLOAT":
            convert_fn = float
        else:
            raise AssertionError("Unknown column type: %s" % coltype)
        if not convert_fn:
            continue
        for i in range(len(data)):
            data[i][j] = convert_fn(data[i][j])

    # The first two columns are for the row names and description.
    col0 = [x[0] for x in data]
    col1 = [x[1] for x in data]
    head0, head1 = "Name", "Description"
    if col_names:
        head0, head1 = col_names[0], col_names[1]
    row_names, row_descriptions = col0, col1
    row_name_header, row_description_header = head0, head1
    if "ROWNAMESCOLUMN" in header:
        col = int(header["ROWNAMESCOLUMN"])
        assert col in [0, 1]
        if col == 1:
            row_names = col1
            row_name_header = head1
    if "ROWDESCRIPTIONSCOLUMN" in header:
        col = int(header["ROWDESCRIPTIONSCOLUMN"])
        assert col in [0, 1]
        if col == 0:
            row_descriptions = col0
            row_description_header = head0

    # Cut off the headers for the annotations.
    col_types = col_types[2:]
    if col_names:
        col_names = col_names[2:]
    matrix = [x[2:] for x in data]

    row_headers = [head0, head1]
    col_headers = None
    row_annots = {}
    col_annots = {}
    synonyms = {}
    row_annots[row_name_header] = row_names
    row_annots[row_description_header] = row_descriptions
    col_annots["TYPE"] = col_types
    if col_descriptions:
        col_annots["DESCRIPTION"] = col_descriptions
    synonyms[const.ROW_ID] = row_name_header

    X = Matrix.InMemoryMatrix(
        matrix, row_names=None, col_names=col_names,
        row_headers=row_headers, col_headers=col_headers,
        row_annots=row_annots, col_annots=col_annots, synonyms=synonyms)
    assert is_matrix(X)
    return X
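
# Usage sketch (not part of the original code) for the ODF reader above, using
# a hypothetical file name.  The datatype argument is ignored because column
# types come from the COLUMN_TYPES header.
def _example_read_odf():
    X = read("example.odf")
    print X.dim()   # (rows, data columns), annotation columns excluded
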
def clean_model(model, factor_cutoff=None):
    # Process the model for simpler handling.  Return a dictionary of
    # the cleaned up model.  The dictionary contains:
    #   A             pxk
    #   Bnz           pxk
    #   PostPib       pxk
    #   factors       pxk   1/0, based on factor_cutoff
    #   ExternalProb  mxk   Parallel to data set.  (May be missing.)
    #   F             kxn
    #   Psi           p
    #   Tau           k
    #   VariablesIn   p     Indexes (0-based) of genes, relative to data set.
    #
    #   GENE_O        p     For sorting original model to this order.  0-based
    #   FACTOR_O      k     For sorting original model to this order.  0-based
    #
    #   p   num_genes_in_model
    #   n   num_samples
    #   k   num_factors          Just latent factors.
    #   m   num_genes_in_dataset
    #
    # Matrix variables, A, F, PostPib, factors, VariablesIn,
    # etc. are all sorted according to FACTOR_O and GENE_O.  FACTOR_O
    # and GENE_O are provided to help convert the original data set to
    # the same order as these variables.
    #
    # Changes from the original model:
    # o Remove the designs and controls.
    # o Make VariablesIn, if it doesn't exist.
    # o Make VariablesIn 0-based, instead of 1-based.
    # o Sort the genes and factors.
    from genomicode import jmath
    from genomicode import Matrix

    factor_cutoff = factor_cutoff or 0.99

    # Figure out the dimensions of this model.
    p = len(model["Psi"])
    n = model["NObservations"]
    q = len(model["Tau"])
    m = model["NVariables"]
    k = q - model["NControlVariables"] - model["NDesigns"]

    # Get rid of the design variables.
    # Boundary case: k == 0, otherwise won't slice correctly.
    if k > 0:
        A = model["A"].matrix(None, (-k, None))
        Bnz = model["Bnz"].matrix(None, (-k, None))
        PostPib = model["PostPib"].matrix(None, (-k, None))
        F = model["F"].matrix((-k, None), None)
        Tau = model["Tau"][-k:]
    else:
        A = model["A"].matrix(None, [])
        Bnz = model["Bnz"].matrix(None, [])
        PostPib = model["PostPib"].matrix(None, [])
        F = model["F"].matrix([], None)
        Tau = []

    # Make the factors variable by thresholding PostPib on factor_cutoff.
    factors = PostPib.matrix()
    for x in factors._X:
        for i in range(len(x)):
            if x[i] >= factor_cutoff:
                x[i] = 1
            else:
                x[i] = 0

    # If VariablesIn doesn't exist, make it.
    VariablesIn = model.get("VariablesIn")
    if not VariablesIn:
        # First, make a 1-based index.
        VariablesIn = list(range(1, p+1))
    # Convert VariablesIn to 0-based index.
    VariablesIn = [x-1 for x in VariablesIn]

    # Make the ExternalProb variable.
    ExternalProb = None
    if "ExternalProb" in model:
        # First column is the (1-based) index of the genes.  These
        # should not overlap with VariablesIn.
        seen = {}.fromkeys(VariablesIn)
        for index in model["ExternalProb"].value(None, 0):
            index = int(index)-1
            assert index not in seen
            seen[index] = 1
        assert len(seen) == m

        prob = [[0]*k for i in range(m)]
        # Set the probabilities from PostPib.
        for i, index in enumerate(VariablesIn):
            prob[index] = PostPib.value(i, None)
        for x in model["ExternalProb"]._X:
            index, x = int(x[0])-1, x[1:]
            assert len(x) == k
            prob[index] = x
        #for index in model["mVariablesIn"]:
        #    print "%d\tmVariablesIn" % index
        #for index in model["mExternalProb"].value(None, 0):
        #    index = int(index)
        #    print "%d\tmExternalProb" % index
        #sys.exit(0)
        ExternalProb = Matrix.InMemoryMatrix(prob)

    # Order the factors based on decreasing number of genes.
    if factors._X:
        sums = jmath.mysum(factors._X, byrow=0)
        FACTOR_O = jmath.order(sums, decreasing=1)
    else:
        # No factors.
        FACTOR_O = []

    # Order the genes based on decreasing number of factors.  Earlier
    # factors should get much higher weights.
    X = factors.slice(None, FACTOR_O)
    if X:
        weights = [2**x for x in reversed(range(k))]
        sums = [None] * p
        for i in range(p):
            sums[i] = sum([x1*x2 for (x1, x2) in zip(X[i], weights)])
        GENE_O = jmath.order(sums, decreasing=1)
    else:
        # No factors.
        # Can't specify genes, or the indexes will be out of range of
        # an empty matrix.
        #GENE_O = list(range(p))
        GENE_O = []

    cmod = {}
    cmod["A"] = A.matrix(GENE_O, FACTOR_O)
    cmod["Bnz"] = Bnz.matrix(GENE_O, FACTOR_O)
    cmod["PostPib"] = PostPib.matrix(GENE_O, FACTOR_O)
    cmod["factors"] = factors.matrix(GENE_O, FACTOR_O)
    if ExternalProb:
        cmod["ExternalProb"] = ExternalProb.matrix(None, FACTOR_O)
    cmod["F"] = F.matrix(FACTOR_O, None)
    cmod["Psi"] = [model["Psi"][i] for i in GENE_O]
    cmod["Tau"] = [Tau[i] for i in FACTOR_O]
    cmod["VariablesIn"] = [VariablesIn[i] for i in GENE_O]
    cmod["GENE_O"] = GENE_O
    cmod["FACTOR_O"] = FACTOR_O

    # Check the dimensions of the matrices.
    if k:
        # If no factors, then the number of rows of the matrices won't
        # be preserved.
        assert cmod["A"].dim() == (p, k), "%s %s" % (A.dim(), (p, k))
        assert cmod["Bnz"].dim() == (p, k)
        assert cmod["PostPib"].dim() == (p, k)
        assert cmod["factors"].dim() == (p, k)
        if "ExternalProb" in cmod:
            assert cmod["ExternalProb"].dim() == (m, k)
        assert cmod["F"].dim() == (k, n)
        # Length of all these will be 0.
        assert len(cmod["Psi"]) == p
        assert len(cmod["Tau"]) == k
        assert len(cmod["VariablesIn"]) == p
        assert len(cmod["GENE_O"]) == p
        assert len(cmod["FACTOR_O"]) == k

    return cmod
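
# Usage sketch (not part of the original code): clean_model expects a raw BFRM
# model dictionary with the keys used above (Psi, Tau, A, Bnz, PostPib, F,
# NObservations, NVariables, NControlVariables, NDesigns, ...).
def _example_clean_model(model):
    cmod = clean_model(model, factor_cutoff=0.99)
    p, k = cmod["PostPib"].dim()   # genes in model x latent factors
    print "Model has %d genes and %d latent factors." % (p, k)
    return cmod
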
CLEAN_RE = None
CLEAN_DISALLOWED = None

def _clean(s, disallowed=None):
    # Make sure there are no disallowed characters in the string s.
    global CLEAN_RE
    global CLEAN_DISALLOWED