Example #1
0
def main(args=None):
    """Extract the signature matrix from a GO-PCA result file and write it
    to a tab-separated text file.

    Parameters
    ----------
    args : argparse.Namespace-like, optional
        Object providing the attributes ``gopca_file``, ``output_file``,
        ``log_file``, ``quiet`` and ``verbose``. If None, the arguments are
        obtained from ``get_argument_parser().parse_args()``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the Python interpreter is older than version 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        # Use positional access: the named fields (.major/.minor) were only
        # added in 2.7, so they are missing on exactly those interpreters
        # for which this error is meant to be raised.
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo[0], vinfo[1]))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(log_file=log_file, quiet=quiet,
                             verbose=verbose)

    # Read the input exactly once (previously the file was loaded twice and
    # the first result was discarded).
    sig_matrix = util.read_gopca_result(gopca_file)

    sig_labels = [sig.get_label(include_id=False)
                  for sig in sig_matrix.signatures]

    # Signature labels take the place of the gene (row) labels.
    matrix = ExpMatrix(genes=sig_labels, samples=sig_matrix.samples,
                       X=sig_matrix.X)
    matrix.index.name = 'Signatures'

    # Temporarily raise the threshold of the expression module's logger
    # while writing, and restore whatever level it had before -- even if
    # writing fails.
    exp_logger = logging.getLogger(expression.__name__)
    previous_level = exp_logger.level
    exp_logger.setLevel(logging.WARNING)
    try:
        matrix.write_tsv(output_file)
    finally:
        exp_logger.setLevel(previous_level)

    logger.info('Wrote %d x %d signature matrix to "%s".',
                matrix.p, matrix.n, output_file)

    return 0
def main(args=None):
    """Convert the row labels of an expression matrix from Entrez IDs to
    gene symbols, keep only genes found in the gene list, and write the
    result to a tab-separated text file.

    Parameters
    ----------
    args : argparse.Namespace-like, optional
        Object providing the attributes ``expression_file``,
        ``entrez2gene_file``, ``gene_file``, ``output_file``,
        ``strip_affy_suffix``, ``log_file``, ``quiet`` and ``verbose``.
        If None, the arguments are obtained from
        ``get_argument_parser().parse_args()``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the Python interpreter is older than version 2.7.
    AssertionError
        If the Entrez IDs, or the gene symbols they map to, are not unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        # Use positional access: the named fields (.major/.minor) were only
        # added in 2.7, so they are missing on exactly those interpreters
        # for which this error is meant to be raised.
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (vinfo[0], vinfo[1])
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # Remove the "_at" suffix, but only where it is actually present;
        # blindly cutting three characters would corrupt bare IDs.
        entrez = [e[:-3] if e.endswith("_at") else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), "Entrez IDs are not unique."

    # convert Entrez IDs to gene names, remembering which rows were kept
    genes = []
    rows = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            continue
        genes.append(g)
        rows.append(i)

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes)), "Gene symbols are not unique."

    f = len(entrez) - len(genes)
    if f > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs to gene symbols (%.1f%%).",
            f,
            matrix.p,
            100 * (f / float(matrix.p)),
        )

    # Select the converted rows in one indexing step. Unlike stacking a
    # (possibly empty) list of rows, this keeps the array two-dimensional
    # even when no ID could be converted.
    # NOTE(review): assumes matrix.X is a 2-D NumPy array -- confirm.
    X = np.asarray(matrix.X, dtype=np.float64)
    X = X[np.asarray(rows, dtype=np.intp), :]

    # filter for genes contained in the gene list
    p = X.shape[0]
    logger.debug(str(X.shape))
    sel = [i for i, g in enumerate(genes) if g in genome]
    genes = [genes[i] for i in sel]
    X = X[np.asarray(sel, dtype=np.intp), :]
    f = p - len(sel)
    if f > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of protein-coding genes (%.1f%%)",
            f,
            p,
            100 * (f / float(p)),
        )

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug(
        "Genes: %d, Samples: %d, matrix: %s",
        len(genes),
        len(matrix.samples),
        str(X.shape),
    )
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def main(args=None):
    """Convert the row labels of an expression matrix from Entrez IDs to
    gene symbols, keep only genes found in the gene table, and write the
    result to a tab-separated text file.

    Parameters
    ----------
    args : argparse.Namespace-like, optional
        Object providing the attributes ``expression_file``,
        ``entrez2gene_file``, ``gene_file``, ``output_file``,
        ``strip_affy_suffix``, ``log_file``, ``quiet`` and ``verbose``.
        If None, the arguments are obtained from
        ``get_argument_parser().parse_args()``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the Python interpreter is older than version 2.7.
    AssertionError
        If the Entrez IDs, or the gene symbols they map to, are not unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        # Use positional access: the named fields (.major/.minor) were only
        # added in 2.7, so they are missing on exactly those interpreters
        # for which this error is meant to be raised.
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo[0], vinfo[1]))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # Remove the "_at" suffix, but only where it is actually present;
        # blindly cutting three characters would corrupt bare IDs.
        entrez = [e[:-3] if e.endswith('_at') else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), 'Entrez IDs are not unique.'

    # convert Entrez IDs to gene names, remembering which rows were kept
    genes = []
    rows = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            continue
        genes.append(g)
        rows.append(i)

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes)), 'Gene symbols are not unique.'

    f = len(entrez) - len(genes)
    if f > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).', f, matrix.p,
            100 * (f / float(matrix.p)))

    # Select the converted rows in one indexing step. Unlike stacking a
    # (possibly empty) list of rows, this keeps the array two-dimensional
    # even when no ID could be converted.
    # NOTE(review): assumes matrix.X is a 2-D NumPy array -- confirm.
    X = np.asarray(matrix.X, dtype=np.float64)
    X = X[np.asarray(rows, dtype=np.intp), :]

    # filter for genes contained in the gene table
    p = X.shape[0]
    logger.debug(str(X.shape))
    sel = [i for i, g in enumerate(genes) if g in genome]
    genes = [genes[i] for i in sel]
    X = X[np.asarray(sel, dtype=np.intp), :]
    f = p - len(sel)
    if f > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)', f, p, 100 * (f / float(p)))

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s', len(genes),
                 len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0