Example #1
0
def test_download(my_expression_file, my_gene_ontology_file,
                  my_fly_gene_set_file):
    """Verify that the downloaded data files exist and load correctly.

    The expression matrix and the gene set collection are parsed and their
    ``hash`` attributes compared against fixed reference values. The gene
    ontology file is only checked for existence.
    """
    expected_matrix_hash = 'aa7cc5e6e04d34e65058f059bcdfe5ea'
    expected_gene_set_hash = '78b4b27e9658560a8e5993154d3228fa'

    # --- expression matrix ---
    print(my_expression_file)
    assert os.path.isfile(my_expression_file)
    expression = ExpMatrix.read_tsv(my_expression_file)
    assert isinstance(expression, ExpMatrix)
    assert expression.hash == expected_matrix_hash

    # --- gene ontology ---
    print(my_gene_ontology_file)
    assert os.path.isfile(my_gene_ontology_file)
    # The ontology is not parsed here; the hash check below is disabled
    # (the hash was suspected not to be stable).
    # ontology = GeneOntology.read_obo(my_gene_ontology_file)
    # assert isinstance(ontology, GeneOntology)
    # assert ontology.hash == '978546899cfb0196ac2005d4b177725f'

    # --- gene set collection ---
    print(my_fly_gene_set_file)
    assert os.path.isfile(my_fly_gene_set_file)
    collection = GeneSetCollection.read_tsv(my_fly_gene_set_file)
    assert isinstance(collection, GeneSetCollection)
    assert collection.hash == expected_gene_set_hash
Example #2
0
def plot_read_count_distribution(barcode_count_file,
                                 output_file,
                                 xaxis_label=('# mapped reads '
                                              '(log<sub>10</sub>-scale)')):
    """Plot a histogram of the log10-transformed values of a count matrix.

    Parameters
    ----------
    barcode_count_file
        File read with `ExpMatrix.read_tsv`; all of its values are pooled
        into a single histogram.
    output_file
        Passed to `plot` as the ``filename`` argument.
    xaxis_label: str, optional
        Title of the x-axis.

    Returns
    -------
    None
    """
    counts = np.float64(ExpMatrix.read_tsv(barcode_count_file).values.ravel())

    # The total is taken before clamping, so it reflects the raw counts.
    total_reads = int(np.sum(counts))

    # Values below 1 are raised to 1 so that they map to 0 on the log scale.
    log_counts = np.log10(np.maximum(counts, 1))

    histogram = go.Histogram(x=log_counts, nbinsx=100)

    layout = go.Layout(
        title='Total number of mapped reads: %d' % total_reads,
        font={'size': 20, 'family': 'serif'},
        xaxis={'title': xaxis_label},
        yaxis={'title': '# barcodes', 'type': 'log'},
    )

    figure = go.Figure(data=[histogram], layout=layout)
    plot(figure, filename=output_file, show_link=False, auto_open=False)
Example #3
0
def test_download(my_expression_file, my_gene_ontology_file,
                  my_fly_gene_set_file):
    """Verify that the downloaded data files exist and load correctly.

    The expression matrix and the gene set collection are parsed and their
    ``hash`` attributes compared against fixed reference values. The gene
    ontology file is only checked for existence.
    """
    expected_matrix_hash = 'aa7cc5e6e04d34e65058f059bcdfe5ea'
    expected_gene_set_hash = '78b4b27e9658560a8e5993154d3228fa'

    # --- expression matrix ---
    print(my_expression_file)
    assert os.path.isfile(my_expression_file)
    expression = ExpMatrix.read_tsv(my_expression_file)
    assert isinstance(expression, ExpMatrix)
    assert expression.hash == expected_matrix_hash

    # --- gene ontology ---
    print(my_gene_ontology_file)
    assert os.path.isfile(my_gene_ontology_file)
    # The ontology is not parsed here; the hash check below is disabled
    # (the hash was suspected not to be stable).
    # ontology = GeneOntology.read_obo(my_gene_ontology_file)
    # assert isinstance(ontology, GeneOntology)
    # assert ontology.hash == '978546899cfb0196ac2005d4b177725f'

    # --- gene set collection ---
    print(my_fly_gene_set_file)
    assert os.path.isfile(my_fly_gene_set_file)
    collection = GeneSetCollection.read_tsv(my_fly_gene_set_file)
    assert isinstance(collection, GeneSetCollection)
    assert collection.hash == expected_gene_set_hash
Example #4
0
def test_tsv(tmpdir, my_matrix):
    """Write the matrix to a TSV file and check that reading it back
    yields a distinct object that compares equal to the original."""
    tsv_path = tmpdir.join('expression_matrix.tsv').strpath
    my_matrix.write_tsv(tsv_path)

    # A check of the file's MD5 digest against
    # 'd34bf3d376eb613e4fea894f7c9d601f' is currently disabled.

    reloaded = ExpMatrix.read_tsv(tsv_path)

    # Must be a new object, not the one that was written ...
    assert reloaded is not my_matrix
    # ... but with the same content.
    assert reloaded == my_matrix
Example #5
0
def test_tsv(tmpdir, my_matrix):
    """Write the matrix to a TSV file and check that reading it back
    yields a distinct object that compares equal to the original."""
    tsv_path = tmpdir.join('expression_matrix.tsv').strpath
    my_matrix.write_tsv(tsv_path)

    # A check of the file's MD5 digest against
    # 'd34bf3d376eb613e4fea894f7c9d601f' is currently disabled.

    reloaded = ExpMatrix.read_tsv(tsv_path)

    # Must be a new object, not the one that was written ...
    assert reloaded is not my_matrix
    # ... but with the same content.
    assert reloaded == my_matrix
Example #6
0
def main(args=None):
    """Run GO-PCA and store the result in a `pickle` file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred, 1 if a required parameter is
        missing or the GO-PCA run returned None).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        # read arguments from the command line
        parser = get_argument_parser()

        # parse first with default options, in case "--help" is specified
        # ("--help" causes the program to exit at this point)
        args = parser.parse_args()

        # now remove the defaults and parse again
        # (removing the defaults is important so that we know which values
        # were specified by the user)
        no_defaults = dict.fromkeys(GOPCA.get_param_defaults())
        no_defaults.update(dict.fromkeys(GOPCAParams.get_param_defaults()))
        parser.set_defaults(**no_defaults)
        args = parser.parse_args()

    # reporting options
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # test if we can write to log_file?

    # configure root logger
    logger = util.get_logger(log_file=log_file, quiet=quiet)

    # check if required parameters were specified
    # (all missing parameters are reported before giving up)
    passed = True
    if args.expression_file is None:
        logger.error('No expression file specified!')
        passed = False
    if args.gene_set_file is None:
        logger.error('No gene set file specified!')
        passed = False
    if args.output_file is None:
        logger.error('No output file specified!')
        passed = False
    if not passed:
        logger.error('Not all required parameters were specified.')
        return 1

    # generate configuration
    if args.config_file is not None:
        # read parameter values from config file
        params = GOPCAParams.read_ini(args.config_file)
    else:
        # start with default configuration
        params = GOPCAParams()

    # overwrite parameters specified on the command line
    for p in GOPCAParams.get_param_defaults():
        v = getattr(args, p)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', p)
            params.set_param(p, v)

    global_params = GOPCA.get_param_defaults()
    for k in list(global_params.keys()):
        v = getattr(args, k)
        if v is not None:
            # report the global parameter itself (`k`), not the leftover
            # loop variable of the previous loop
            logger.debug('Parameter "%s" specified on command line!', k)
            global_params[k] = v

    # read expression file
    matrix = ExpMatrix.read_tsv(args.expression_file)
    logger.info('Expression matrix size: '
                '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n)

    if args.sel_var_genes > 0:
        # filter genes by variance
        matrix = matrix.filter_variance(args.sel_var_genes)

    # read gene set file
    gene_sets = GeneSetCollection.read_tsv(args.gene_set_file)
    # NOTE(review): this looks like leftover debug output -- confirm whether
    # it should go through the logger instead.
    print(args.gene_set_file, gene_sets)

    # read ontology file (if supplied)
    gene_ontology = None
    if args.gene_ontology_file is not None:
        # temporarily raise the genometools logger level to ERROR while
        # parsing; the level is reset even if parsing fails
        p_logger = logging.getLogger(genometools.__name__)
        p_logger.setLevel(logging.ERROR)
        try:
            gene_ontology = GeneOntology.read_obo(
                args.gene_ontology_file,
                part_of_cc_only=params.go_part_of_cc_only)
        finally:
            p_logger.setLevel(logging.NOTSET)

    M = GOPCA.simple_setup(matrix, params, gene_sets, gene_ontology,
                           verbose=verbose, **global_params)
    run = M.run()

    if run is None:
        logger.error('GO-PCA run failed!')
        return 1

    # write run to pickle file
    logger.info('Storing GO-PCA run in file "%s"...', args.output_file)
    run.write_pickle(args.output_file)

    return 0
Example #7
0
def my_matrix_filtered(my_expression_file):
    """Load the expression file and return it after `filter_variance`.

    `filter_variance` is called with an argument of 8000 (presumably the
    number of genes to retain -- TODO confirm).
    """
    return filter_variance(ExpMatrix.read_tsv(my_expression_file), 8000)
Example #8
0
def my_matrix_filtered(my_expression_file):
    """Load the expression file and return it after `filter_variance`.

    `filter_variance` is called with an argument of 8000 (presumably the
    number of genes to retain -- TODO confirm).
    """
    return filter_variance(ExpMatrix.read_tsv(my_expression_file), 8000)
Example #9
0
def main(args=None):
    """Run GO-PCA and store the result in a `pickle` file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred, 1 if a required parameter is
        missing or the GO-PCA run returned None).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        # read arguments from the command line
        parser = get_argument_parser()

        # parse first with default options, in case "--help" is specified
        # ("--help" causes the program to exit at this point)
        args = parser.parse_args()

        # now remove the defaults and parse again
        # (removing the defaults is important so that we know which values
        # were specified by the user)
        no_defaults = dict.fromkeys(GOPCA.get_param_defaults())
        no_defaults.update(dict.fromkeys(GOPCAParams.get_param_defaults()))
        parser.set_defaults(**no_defaults)
        args = parser.parse_args()

    # reporting options
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # test if we can write to log_file?

    # configure root logger
    logger = util.get_logger(log_file=log_file, quiet=quiet)

    # check if required parameters were specified
    # (all missing parameters are reported before giving up)
    passed = True
    if args.expression_file is None:
        logger.error('No expression file specified!')
        passed = False
    if args.gene_set_file is None:
        logger.error('No gene set file specified!')
        passed = False
    if args.output_file is None:
        logger.error('No output file specified!')
        passed = False
    if not passed:
        logger.error('Not all required parameters were specified.')
        return 1

    # generate configuration
    if args.config_file is not None:
        # read parameter values from config file
        params = GOPCAParams.read_ini(args.config_file)
    else:
        # start with default configuration
        params = GOPCAParams()

    # overwrite parameters specified on the command line
    for p in GOPCAParams.get_param_defaults():
        v = getattr(args, p)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', p)
            params.set_param(p, v)

    global_params = GOPCA.get_param_defaults()
    for k in list(global_params.keys()):
        v = getattr(args, k)
        if v is not None:
            # report the global parameter itself (`k`), not the leftover
            # loop variable of the previous loop
            logger.debug('Parameter "%s" specified on command line!', k)
            global_params[k] = v

    # read expression file
    matrix = ExpMatrix.read_tsv(args.expression_file)
    logger.info(
        'Expression matrix size: (p = %d genes) x (n = %d samples).',
        matrix.p, matrix.n)

    if args.sel_var_genes > 0:
        # filter genes by variance
        matrix = matrix.filter_variance(args.sel_var_genes)

    # read gene set file
    gene_sets = GeneSetCollection.read_tsv(args.gene_set_file)
    # NOTE(review): this looks like leftover debug output -- confirm whether
    # it should go through the logger instead.
    print(args.gene_set_file, gene_sets)

    # read ontology file (if supplied)
    gene_ontology = None
    if args.gene_ontology_file is not None:
        # temporarily raise the genometools logger level to ERROR while
        # parsing; the level is reset even if parsing fails
        p_logger = logging.getLogger(genometools.__name__)
        p_logger.setLevel(logging.ERROR)
        try:
            gene_ontology = GeneOntology.read_obo(
                args.gene_ontology_file,
                part_of_cc_only=params.go_part_of_cc_only)
        finally:
            p_logger.setLevel(logging.NOTSET)

    M = GOPCA.simple_setup(matrix,
                           params,
                           gene_sets,
                           gene_ontology,
                           verbose=verbose,
                           **global_params)
    run = M.run()

    if run is None:
        logger.error('GO-PCA run failed!')
        return 1

    # write run to pickle file
    logger.info('Storing GO-PCA run in file "%s"...', args.output_file)
    run.write_pickle(args.output_file)

    return 0
def main(args=None):
    """Convert the Entrez IDs of an expression matrix to gene names.

    Reads an expression matrix whose gene labels are Entrez IDs, translates
    each ID through the Entrez-to-gene mapping file, keeps only the genes
    contained in the gene list read from ``args.gene_file``, and writes the
    resulting matrix to ``args.output_file``.

    Parameters
    ----------
    args: object, optional
        The argument values (attributes ``expression_file``,
        ``entrez2gene_file``, ``gene_file``, ``output_file``,
        ``strip_affy_suffix``, ``log_file``, ``quiet`` and ``verbose`` are
        read). If not specified, the values are obtained from
        ``get_argument_parser().parse_args()``.

    Returns
    -------
    int
        Exit code (always 0).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If the Entrez IDs, or the gene names they map to, are not unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (vinfo.major, vinfo.minor)
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs; IDs that do not carry the
        # suffix are left untouched instead of losing their last characters
        entrez = [e[:-3] if e.endswith("_at") else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), "Entrez IDs are not unique."

    # convert Entrez IDs to gene names
    f = 0  # number of Entrez IDs without a gene name
    genes = []
    X = []

    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            f += 1
        else:
            genes.append(g)
            X.append(matrix.X[i, :])

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes)), "Gene names are not unique."
    if f > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs " "to gene symbols (%.1f%%).",
            f,
            matrix.p,
            100 * (f / float(matrix.p)),
        )

    # filter for known protein-coding genes
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))
    known = np.array([g in genome for g in genes], dtype=np.bool_)
    sel = np.nonzero(known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]
    f = p - sel.size  # number of gene names not found in the gene list
    if f > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of " "protein-coding genes (%.1f%%)",
            f,
            p,
            100 * (f / float(p)),
        )

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug("Genes: %d, Samples: %d, matrix: %s", len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def main(args=None):
    """Convert the Entrez IDs of an expression matrix to gene names.

    Reads an expression matrix whose gene labels are Entrez IDs, translates
    each ID through the Entrez-to-gene mapping file, keeps only the genes
    contained in the gene table read from ``args.gene_file``, and writes the
    resulting matrix to ``args.output_file``.

    Parameters
    ----------
    args: object, optional
        The argument values (attributes ``expression_file``,
        ``entrez2gene_file``, ``gene_file``, ``output_file``,
        ``strip_affy_suffix``, ``log_file``, ``quiet`` and ``verbose`` are
        read). If not specified, the values are obtained from
        ``get_argument_parser().parse_args()``.

    Returns
    -------
    int
        Exit code (always 0).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If the Entrez IDs, or the gene names they map to, are not unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs; IDs that do not carry the
        # suffix are left untouched instead of losing their last characters
        entrez = [e[:-3] if e.endswith('_at') else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), 'Entrez IDs are not unique.'

    # convert Entrez IDs to gene names
    f = 0  # number of Entrez IDs without a gene name
    genes = []
    X = []

    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            f += 1
        else:
            genes.append(g)
            X.append(matrix.X[i, :])

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes)), 'Gene names are not unique.'
    if f > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).', f, matrix.p,
            100 * (f / float(matrix.p)))

    # filter for known protein-coding genes
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))
    known = np.array([g in genome for g in genes], dtype=np.bool_)
    sel = np.nonzero(known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]
    f = p - sel.size  # number of gene names not found in the gene table
    if f > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)', f, p, 100 * (f / float(p)))

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s', len(genes),
                 len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0