def main(args=None):
    """Write the signature matrix of a GO-PCA result to a tab-delimited file.

    Parameters
    ----------
    args : argparse.Namespace-like object, optional
        Object providing the attributes ``gopca_file``, ``output_file``,
        ``log_file``, ``quiet`` and ``verbose``. If None, the arguments are
        parsed from the command line using ``get_argument_parser()``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the Python interpreter is older than version 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        # Index access instead of ``vinfo.major``/``vinfo.minor``: the named
        # fields only exist on Python >= 2.7, i.e. exactly not on the
        # interpreters this branch is meant to reject.
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo[0], vinfo[1]))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    # Read the input once (the unused duplicate read has been removed).
    sig_matrix = util.read_gopca_result(gopca_file)

    # Label each row with its signature label (without the ID).
    sig_labels = [sig.get_label(include_id=False)
                  for sig in sig_matrix.signatures]
    matrix = ExpMatrix(genes=sig_labels, samples=sig_matrix.samples,
                       X=sig_matrix.X)
    matrix.index.name = 'Signatures'

    # Raise the level of the ``expression`` logger while writing (presumably
    # to silence its informational output) and restore the previous level
    # afterwards, even if writing fails.
    exp_logger = logging.getLogger(expression.__name__)
    previous_level = exp_logger.level
    exp_logger.setLevel(logging.WARNING)
    try:
        matrix.write_tsv(output_file)
    finally:
        exp_logger.setLevel(previous_level)

    logger.info('Wrote %d x %d signature matrix to "%s".',
                matrix.p, matrix.n, output_file)

    return 0
def main(args=None):
    """Convert an expression matrix indexed by Entrez IDs to gene symbols.

    Reads an expression matrix, maps its row identifiers to gene symbols
    using a two-column mapping file, keeps only the genes contained in the
    gene list read from ``gene_file``, and writes the resulting matrix to a
    tab-delimited file.

    Parameters
    ----------
    args : argparse.Namespace-like object, optional
        Object providing the attributes ``expression_file``,
        ``entrez2gene_file``, ``gene_file``, ``output_file``,
        ``strip_affy_suffix``, ``log_file``, ``quiet`` and ``verbose``.
        If None, the arguments are parsed from the command line using
        ``get_argument_parser()``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the Python interpreter is older than version 2.7.
    AssertionError
        If the (stripped) Entrez IDs or the converted gene symbols are not
        unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        # Index access instead of ``vinfo.major``/``vinfo.minor``: the named
        # fields only exist on Python >= 2.7, i.e. exactly not on the
        # interpreters this branch is meant to reject.
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (vinfo[0], vinfo[1])
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix

    # configure root logger
    logger = misc.get_logger(
        log_file=args.log_file, quiet=args.quiet, verbose=args.verbose
    )

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # Remove the "_at" suffix from Entrez IDs. Only strip when the suffix
        # is really there, so IDs without it are not truncated.
        entrez = [e[:-3] if e.endswith("_at") else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), "Entrez IDs are not unique."

    # convert Entrez IDs to gene names, remembering which rows were converted
    n_failed = 0
    genes = []
    converted_rows = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            n_failed += 1
        else:
            genes.append(g)
            converted_rows.append(i)

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes)), "Gene symbols are not unique."

    if n_failed > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs "
            "to gene symbols (%.1f%%).",
            n_failed,
            matrix.p,
            100 * (n_failed / float(matrix.p)),
        )

    # Select the converted rows from the 2-D matrix in one step. Unlike
    # stacking a list of rows, this keeps the array two-dimensional even when
    # no ID could be converted.
    row_idx = np.array(converted_rows, dtype=np.intp)
    X = np.asarray(matrix.X, dtype=np.float64)[row_idx, :]
    p = X.shape[0]
    logger.debug(str(X.shape))

    # filter for known protein-coding genes
    known = np.array([g in genome for g in genes], dtype=np.bool_)
    sel = np.nonzero(known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]

    n_unknown = p - sel.size
    if n_unknown > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of "
            "protein-coding genes (%.1f%%)",
            n_unknown,
            p,
            100 * (n_unknown / float(p)),
        )

    # generate new matrix (per the original note, this automatically sorts
    # the genes alphabetically)
    logger.debug(
        "Genes: %d, Samples: %d, matrix: %s",
        len(genes),
        len(matrix.samples),
        str(X.shape),
    )
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def main(args=None):
    """Convert an expression matrix indexed by Entrez IDs to gene symbols.

    Reads an expression matrix, maps its row identifiers to gene symbols
    using a two-column mapping file, keeps only the genes contained in the
    gene table read from ``gene_file``, and writes the resulting matrix to a
    tab-delimited file.

    Parameters
    ----------
    args : argparse.Namespace-like object, optional
        Object providing the attributes ``expression_file``,
        ``entrez2gene_file``, ``gene_file``, ``output_file``,
        ``strip_affy_suffix``, ``log_file``, ``quiet`` and ``verbose``.
        If None, the arguments are parsed from the command line using
        ``get_argument_parser()``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the Python interpreter is older than version 2.7.
    AssertionError
        If the (stripped) Entrez IDs or the converted gene symbols are not
        unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        # Index access instead of ``vinfo.major``/``vinfo.minor``: the named
        # fields only exist on Python >= 2.7, i.e. exactly not on the
        # interpreters this branch is meant to reject.
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo[0], vinfo[1]))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    # read data
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # Remove the "_at" suffix from Entrez IDs. Only strip when the suffix
        # is really there, so IDs without it are not truncated.
        entrez = [e[:-3] if e.endswith('_at') else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), 'Entrez IDs are not unique.'

    # convert Entrez IDs to gene names, remembering which rows were converted
    n_failed = 0
    genes = []
    converted_rows = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            n_failed += 1
        else:
            genes.append(g)
            converted_rows.append(i)

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes)), 'Gene symbols are not unique.'

    if n_failed > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).',
            n_failed, matrix.p, 100 * (n_failed / float(matrix.p)))

    # Select the converted rows from the 2-D matrix in one step. Unlike
    # stacking a list of rows, this keeps the array two-dimensional even when
    # no ID could be converted.
    row_idx = np.array(converted_rows, dtype=np.intp)
    X = np.asarray(matrix.X, dtype=np.float64)[row_idx, :]
    p = X.shape[0]
    logger.debug(str(X.shape))

    # filter for known protein-coding genes
    known = np.array([g in genome for g in genes], dtype=np.bool_)
    sel = np.nonzero(known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]

    n_unknown = p - sel.size
    if n_unknown > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)',
            n_unknown, p, 100 * (n_unknown / float(p)))

    # generate new matrix (per the original note, this automatically sorts
    # the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s',
                 len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0