Example #1
def main(args=None):

    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    #sig_max_len = args.sig_max_len
    #sig_reverse_order = args.sig_reverse_order

    #sample_cluster_metric = args.sample_cluster_metric
    #no_sample_clustering = args.no_sample_clustering

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(log_file=log_file, quiet=quiet,
                             verbose=verbose)

    sig_matrix = util.read_gopca_result(gopca_file)

    sig_labels = [sig.get_label(include_id=False)
                  for sig in sig_matrix.signatures]

    matrix = ExpMatrix(genes=sig_labels, samples=sig_matrix.samples,
                       X=sig_matrix.X)
    matrix.index.name = 'Signatures'
    #signatures = result.signatures
    #sig_labels = [sig.get_label(max_name_length=sig_max_len, include_id=False)
    #              for sig in signatures]
    #samples = list(result.samples)

    # generate expression matrix
    #E = ExpMatrix(genes=sig_labels, samples=samples, X=sig_matrix.X)

    # clustering of signatures (rows)
    #E, _ = cluster.cluster_genes(E, reverse=sig_reverse_order)

    exp_logger = logging.getLogger(expression.__name__)
    exp_logger.setLevel(logging.WARNING)
    matrix.write_tsv(output_file)
    exp_logger.setLevel(logging.NOTSET)
    logger.info('Wrote %d x %d signature matrix to "%s".',
                matrix.p, matrix.n, output_file)

    return 0
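The setLevel dance around write_tsv above (raise a sub-logger to WARNING, write, reset to NOTSET) recurs throughout these examples. A standard-library sketch of the same idea as a context manager (not part of the original module):

import logging
from contextlib import contextmanager

@contextmanager
def quiet_logger(name, level=logging.WARNING):
    """Temporarily raise the level of the named logger."""
    log = logging.getLogger(name)
    old_level = log.level
    log.setLevel(level)
    try:
        yield
    finally:
        log.setLevel(old_level)

# usage (names as in Example #1):
#     with quiet_logger(expression.__name__):
#         matrix.write_tsv(output_file)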
Example #2
    def from_signatures(cls,
                        signatures,
                        standardize=False,
                        center=True,
                        use_median=True,
                        cluster_signatures=True,
                        signature_cluster_metric='correlation',
                        cluster_samples=True,
                        sample_cluster_metric='euclidean',
                        cluster_method='average'):
        """Generate a GO-PCA signature matrix from individual signatures.

        The GO-PCA signature matrix contains the expression levels of all
        signatures (rows) generated, across all samples (columns) in the
        analysis. See the documentation of the `GOPCASignature` class for
        details on how signature expression levels are calculated.

        Parameters
        ----------
        signatures: Iterable of `GOPCASignature`
            The signatures generated.
        """
        # TODO: finish docstring
        assert isinstance(signatures, Iterable)
        assert isinstance(standardize, bool)
        assert isinstance(center, bool)
        assert isinstance(use_median, bool)
        assert isinstance(cluster_signatures, bool)
        assert isinstance(cluster_samples, bool)

        ### generate the expression matrix
        matrix = ExpMatrix(
            pd.concat([
                sig.get_expression(standardize=standardize,
                                   center=center,
                                   use_median=use_median) for sig in signatures
            ],
                      axis=1).T)
        matrix.genes.name = 'Signatures'
        matrix.samples.name = 'Samples'

        if matrix.p == 1:
            cluster_signatures = False
            cluster_samples = False

        ### clustering
        if cluster_signatures:
            # cluster signatures
            matrix = cluster.cluster_genes(matrix,
                                           metric=signature_cluster_metric,
                                           method=cluster_method)

        if cluster_samples:
            # cluster samples
            matrix = cluster.cluster_samples(matrix,
                                             metric=sample_cluster_metric,
                                             method=cluster_method)

        return cls(matrix)
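A minimal pandas sketch of the concat-and-transpose step above, with plain Series standing in for the signature expression vectors (names and values here are made up for illustration):

import pandas as pd

samples = ['s1', 's2', 's3']
# each signature's expression is a Series indexed by sample
sig_a = pd.Series([0.5, -0.2, 1.1], index=samples, name='Sig A')
sig_b = pd.Series([1.0, 0.3, -0.7], index=samples, name='Sig B')

# concat along axis=1 yields samples x signatures; .T flips it to
# signatures (rows) x samples (columns), as in from_signatures()
sig_matrix = pd.concat([sig_a, sig_b], axis=1).T
sig_matrix.index.name = 'Signatures'
sig_matrix.columns.name = 'Samples'
print(sig_matrix)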
Example #3
def plot_read_count_distribution(barcode_count_file,
                                 output_file,
                                 xaxis_label=('# mapped reads '
                                              '(log<sub>10</sub>-scale)')):
    """Plot histogram of the distribution of reads per barcode.
    
    TODO: docstring"""
    matrix = ExpMatrix.read_tsv(barcode_count_file)

    x = np.float64(matrix.values.ravel())
    num_total_reads = int(np.sum(x))
    x[x < 1] = 1
    x = np.log10(x)

    data = [go.Histogram(x=x, nbinsx=100)]

    layout = go.Layout(
        title='Total number of mapped reads: %d' % num_total_reads,
        font=dict(
            size=20,
            family='serif',
        ),
        xaxis=dict(title=xaxis_label),
        yaxis=dict(
            title='# barcodes',
            type='log',
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    plot(fig, filename=output_file, show_link=False, auto_open=False)
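A toy check of the clipping step above: barcodes with zero mapped reads are clipped to 1 so that log10 sends them to 0 instead of -inf:

import numpy as np

x = np.float64([0, 1, 10, 1000])
x[x < 1] = 1
print(np.log10(x))  # [0. 0. 1. 3.]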
Example #4
def test_sparse(tmpdir, my_matrix):
    """Test reading/writing of sparse text format."""
    output_file = tmpdir.join('expression_matrix.mtx').strpath
    my_matrix.write_sparse(output_file)
    other = ExpMatrix.read_sparse(output_file)
    assert other is not my_matrix
    assert other == my_matrix
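ExpMatrix.write_sparse is not shown here; judging by the .mtx extension it presumably uses the Matrix Market format. A self-contained scipy sketch of the same round-trip idea:

import numpy as np
from scipy.io import mmread, mmwrite
from scipy.sparse import csr_matrix

X = csr_matrix(np.float64([[0, 1, 0], [2, 0, 3]]))
mmwrite('expression_matrix.mtx', X)
Y = mmread('expression_matrix.mtx').tocsr()
assert (X != Y).nnz == 0  # no entries differ after the round trip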
Example #5
def test_download(my_expression_file, my_gene_ontology_file,
                  my_fly_gene_set_file):
    """Test if required data files were downloaded successfully."""

    # expression file
    print(my_expression_file)
    assert os.path.isfile(my_expression_file)
    matrix = ExpMatrix.read_tsv(my_expression_file)
    assert isinstance(matrix, ExpMatrix)
    assert matrix.hash == 'aa7cc5e6e04d34e65058f059bcdfe5ea'

    # gene ontology file
    print(my_gene_ontology_file)
    assert os.path.isfile(my_gene_ontology_file)
    # hash not stable?
    #ontology = GeneOntology.read_obo(my_gene_ontology_file)
    #assert isinstance(ontology, GeneOntology)
    #assert ontology.hash == '978546899cfb0196ac2005d4b177725f'

    # gene set file
    print(my_fly_gene_set_file)
    assert os.path.isfile(my_fly_gene_set_file)
    gene_sets = GeneSetCollection.read_tsv(my_fly_gene_set_file)
    assert isinstance(gene_sets, GeneSetCollection)
    assert gene_sets.hash == '78b4b27e9658560a8e5993154d3228fa'
Example #7
def test_tsv(tmpdir, my_matrix):
    output_file = tmpdir.join('expression_matrix.tsv').strpath
    my_matrix.write_tsv(output_file)
    # data = open(str(path), mode='rb').read()
    # h = hashlib.md5(data).hexdigest()
    # assert h == 'd34bf3d376eb613e4fea894f7c9d601f'
    other = ExpMatrix.read_tsv(output_file)
    assert other is not my_matrix
    assert other == my_matrix
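The commented-out lines above sketch an md5 check of the written file; a working standalone version, using only the standard library:

import hashlib

def file_md5(path):
    """Return the hex md5 digest of a file's contents."""
    with open(path, mode='rb') as fh:
        return hashlib.md5(fh.read()).hexdigest()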
Example #9
def my_matrix_filtered(my_expression_file):
    matrix = ExpMatrix.read_tsv(my_expression_file)
    matrix_filtered = filter_variance(matrix, 8000)
    return matrix_filtered
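filter_variance itself is not shown. A minimal pandas sketch of the usual approach (keep the top-k rows by variance across samples; the real helper may differ in details):

import pandas as pd

def filter_variance_sketch(df, top_k):
    """Keep the top_k genes (rows) with the highest variance."""
    variances = df.var(axis=1)
    keep = variances.nlargest(top_k).index
    return df.loc[keep]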
Example #11
def main(args=None):
    """Run GO-PCA and store the result in a `pickle` file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
 
    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        # read arguments from the command line
        parser = get_argument_parser()

        # parse first with default options, in case "--help" is specified
        # ("--help" causes the program to exit at this point)
        args = parser.parse_args()

        # now remove the defaults and parse again
        # (removing the defaults is important so that we know which values
        # were specified by the user)
        no_defaults = dict([p, None] for p in GOPCA.get_param_defaults())
        no_defaults2 = dict([p, None]
                            for p in GOPCAParams.get_param_defaults())
        no_defaults.update(no_defaults2)
        parser.set_defaults(**no_defaults)
        args = parser.parse_args()

    # reporting options
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # test if we can write to log_file?

    # configure root logger
    logger = util.get_logger(log_file=log_file, quiet=quiet)

    # check if required parameters were specified
    passed = True
    if args.expression_file is None:
        logger.error('No expression file specified!')
        passed = False
    if args.gene_set_file is None:
        logger.error('No gene set file specified!')
        passed = False
    if args.output_file is None:
        logger.error('No output file specified!')
        passed = False
    if not passed:
        logger.error('Not all required parameters were specified.')
        return 1

    # generate configuration
    if args.config_file is not None:
        # read parameter values from config file
        params = GOPCAParams.read_ini(args.config_file)
    else:
        # start with default configuration
        params = GOPCAParams()

    # overwrite parameters specified on the command line
    for p in GOPCAParams.get_param_defaults():
        v = getattr(args, p)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', p)
            params.set_param(p, v)

    global_params = GOPCA.get_param_defaults()
    for k in list(global_params.keys()):
        v = getattr(args, k)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', k)
            global_params[k] = v

    # read expression file
    matrix = ExpMatrix.read_tsv(args.expression_file)
    logger.info('Expression matrix size: '
                '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n)

    if args.sel_var_genes > 0:
        # filter genes by variance
        matrix = matrix.filter_variance(args.sel_var_genes)

    # read gene set file
    gene_sets = GeneSetCollection.read_tsv(args.gene_set_file)

    # read ontology file (if supplied)
    gene_ontology = None
    if args.gene_ontology_file is not None:
        p_logger = logging.getLogger(genometools.__name__)
        p_logger.setLevel(logging.ERROR)
        gene_ontology = GeneOntology.read_obo(
            args.gene_ontology_file, part_of_cc_only=params.go_part_of_cc_only)
        p_logger.setLevel(logging.NOTSET)

    M = GOPCA.simple_setup(matrix,
                           params,
                           gene_sets,
                           gene_ontology,
                           verbose=verbose,
                           **global_params)
    run = M.run()

    if run is None:
        logger.error('GO-PCA run failed!')
        return 1

    # write run to pickle file
    logger.info('Storing GO-PCA run in file "%s"...', args.output_file)
    run.write_pickle(args.output_file)

    return 0
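The parse-twice trick above is worth isolating: parse once with the real defaults (so --help displays them), then null the defaults out and re-parse to detect which options the user actually set. A self-contained sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--threshold', type=float, default=0.5)

# first parse: defaults are filled in as usual
args = parser.parse_args([])
assert args.threshold == 0.5

# second parse with defaults nulled out: an option the user did not
# specify now comes back as None, so it can be told apart from a default
parser.set_defaults(threshold=None)
args = parser.parse_args([])
assert args.threshold is None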
Example #12
def rma(cdf_file,
        sample_cel_files,
        pm_probes_only=True,
        bg_correct=True,
        quantile_normalize=True,
        medianpolish=True):
    """Perform RMA on a set of samples.

    Parameters
    ----------
    cdf_file: str
        The path of the Brainarray CDF file to use.
        Note: Brainarray CDF files can be downloaded from
            http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/genomic_curated_CDF.asp
    sample_cel_files: collections.OrderedDict (str => str)
        An ordered dictionary where each key/value-pair corresponds to a
        sample. The *key* is the sample name, and the *value* is the (absolute)
        path of the corresponding CEL file. The CEL files can be gzip'ed.
    pm_probes_only: bool, optional
        Whether or not to only use PM (perfect match) probes and ignore all MM
        (mismatch) probes. [True]
    bg_correct: bool, optional
        Whether or not to apply background correction. [True]
    quantile_normalize: bool, optional
        Whether or not to apply quantile normalization. [True]
    medianpolish: bool, optional
        Whether or not to apply medianpolish. [True]

    Returns
    -------
    genes: list of str
        The list of gene names.
    samples: list of str
        The list of sample names.
    X: np.ndarray (ndim = 2, dtype = np.float32)
        The expression matrix (genes-by-samples).

    Examples
    --------
    >>> from collections import OrderedDict
    >>> import pyaffy
    >>> cdf_file = '/path/to/brainarray/cdf/HGU133Plus2_Hs_ENTREZG.cdf'
    >>> sample_cel_files = OrderedDict([
    ...     ['Sample 1', '/path/to/sample_1.CEL.gz'],
    ...     ['Sample 2', '/path/to/sample_2.CEL.gz'],
    ... ])
    >>> genes, samples, X = pyaffy.rma(cdf_file, sample_cel_files)
    """

    ### checks
    assert isinstance(cdf_file, (str, _oldstr))
    assert os.path.isfile(cdf_file), \
        'CDF file "%s" does not exist!' % cdf_file

    assert isinstance(sample_cel_files, collections.OrderedDict)
    for sample, cel_file in sample_cel_files.items():
        assert isinstance(sample, (str, _oldstr))
        assert isinstance(cel_file, (str, _oldstr))
        assert os.path.isfile(cel_file), \
            'CEL file "%s" does not exist!' % cel_file

    assert isinstance(pm_probes_only, bool)
    assert isinstance(bg_correct, bool)
    assert isinstance(quantile_normalize, bool)
    assert isinstance(medianpolish, bool)

    t00 = time.time()

    ### read CDF data
    logger.info('Parsing CDF file.')
    t0 = time.time()
    # parse the CDF file
    probe_type = 'pm'
    if not pm_probes_only:
        probe_type = 'all'
    name, num_rows, num_cols, pm_probesets = \
            parse_cdf(cdf_file, probe_type=probe_type)

    # concatenate indices of all PM probes into one long vector
    pm_sel = np.concatenate(list(pm_probesets.values()))

    t1 = time.time()
    logger.info('CDF file parsing time: %.2f s', t1 - t0)
    logger.info('CDF array design name: %s', name)
    logger.info('CDF rows / columns: %d x %d', num_rows, num_cols)

    ### read CEL data
    logger.info('Parsing CEL files...')
    t0 = time.time()
    p = pm_sel.size
    n = len(sample_cel_files)
    Y = np.empty((p, n), dtype=np.float32)

    samples = []
    sub_logger = logging.getLogger(celparser.__name__)
    sub_logger.setLevel(logging.WARNING)
    for j, (sample, cel_file) in enumerate(sample_cel_files.items()):
        logger.debug('Parsing CEL file for sample "%s": %s', sample, cel_file)
        samples.append(sample)
        y = parse_cel(cel_file)
        Y[:, j] = y[pm_sel]
    sub_logger.setLevel(logging.NOTSET)
    t1 = time.time()
    logger.info('CEL files parsing time: %.1f s.', t1 - t0)

    ### background correction
    if bg_correct:
        logger.info('Performing background correction...')
        t0 = time.time()
        Y = rma_bg_correct(Y)
        t1 = time.time()
        logger.info('Background correction time: %.1f s.', t1 - t0)
    else:
        logger.info('Skipping background correction.')

    matrix = ExpMatrix(genes=pm_sel, samples=samples, X=Y)

    ### quantile normalization
    if quantile_normalize:
        logger.info('Performing quantile normalization...')
        t0 = time.time()
        matrix = qnorm(matrix)
        t1 = time.time()
        logger.info('Quantile normalization time: %.1f s.', t1 - t0)
    else:
        logger.info('Skipping quantile normalization.')

    ### convert intensities to log2-scale
    Y = np.log2(matrix.values)

    ### probeset summarization (with or without median polish)
    method = 'with'
    if not medianpolish:
        method = 'without'
    logger.info('Summarize probeset intensities (%s medianpolish)...', method)

    t0 = time.time()
    p = len(pm_probesets)
    n = Y.shape[1]
    X = np.empty((p, n), dtype=np.float32)
    cur = 0
    num_converged = 0
    genes = []
    for i, (gene_id, probes) in enumerate(pm_probesets.items()):
        genes.append(gene_id)

        if medianpolish:
            #X_sub = np.ascontiguousarray(Y[cur:(cur + probes.size),:])
            X_sub = Y[cur:(cur + probes.size), :]
            _, row_eff, col_eff, global_eff, converged, num_iter = medpolish(
                X_sub, copy=False)
            X[i, :] = col_eff + global_eff
            if converged:
                num_converged += 1

        else:
            # simply use median across probes
            X[i, :] = np.median(Y[cur:(cur + probes.size), :], axis=0)
            #X[i,:] = np.ma.median(X_sub, axis = 0)

        cur += probes.size

    t1 = time.time()
    logger.info('Probeset summarization time: %.2f s.', t1 - t0)

    if medianpolish:
        logger.debug('Converged: %d / %d (%.1f%%)', num_converged, p,
                     100 * (num_converged / float(p)))

    ### report total time
    t11 = time.time()
    logger.info('Total RMA time: %.1f s.', t11 - t00)

    ### sort alphabetically by gene name
    a = np.lexsort([genes])
    genes = [genes[i] for i in a]
    X = X[a, :]

    return genes, samples, X
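The qnorm call above is not shown. For reference, a common numpy formulation of quantile normalization (ties handled naively; the actual implementation may differ):

import numpy as np

def quantile_normalize_sketch(Y):
    """Quantile-normalize the columns (samples) of Y."""
    # rank of each entry within its column
    ranks = np.argsort(np.argsort(Y, axis=0), axis=0)
    # reference distribution: mean of the k-th smallest values per column
    reference = np.mean(np.sort(Y, axis=0), axis=1)
    return reference[ranks]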
Example #13
    def get_heatmap(self,
                    sig_matrix=None,
                    standardize=False,
                    center=True,
                    use_median=True,
                    include_id=False,
                    include_stats=True,
                    include_pval=True,
                    cluster_genes=True,
                    gene_cluster_metric='correlation',
                    cluster_samples=True,
                    sample_cluster_metric='euclidean',
                    cluster_method='average',
                    colorbar_label=None,
                    **kwargs):
        """Generate a heatmap of the signature gene matrix."""
        # TODO: Finish docstring

        assert isinstance(cluster_genes, bool)
        assert isinstance(cluster_samples, bool)
        assert isinstance(gene_cluster_metric, (str, _oldstr))
        assert isinstance(sample_cluster_metric, (str, _oldstr))
        assert isinstance(cluster_method, (str, _oldstr))

        from . import GOPCASignatureMatrix
        if sig_matrix is not None:
            assert isinstance(sig_matrix, GOPCASignatureMatrix)

        matrix = self.matrix.copy()
        if standardize:
            matrix.standardize_genes(inplace=True)
            cb_default_label = ('Standardized expression<br>'
                                '(based on log<sub>2</sub>-scale)')
        elif center:
            matrix.center_genes(use_median=use_median, inplace=True)
            cb_default_label = 'Centered expression<br>(log<sub>2</sub>-scale)'
        else:
            cb_default_label = 'Expression<br>(log<sub>2</sub>-scale)'

        if colorbar_label is None:
            colorbar_label = cb_default_label

        # clustering
        if sig_matrix is not None:
            # user has provided a GOPCASignatureMatrix instance
            # make sure its samples match the signature's
            logger.info('Ordering samples to match order in signature matrix.')
            assert set(sig_matrix.samples) == set(self.samples.values)

            # re-arrange samples according to clustering of signature matrix
            matrix = matrix.loc[:, sig_matrix.samples]

        elif cluster_samples:
            # cluster samples (only if no signature matrix is provided)
            matrix = cluster.cluster_samples(matrix,
                                             metric=sample_cluster_metric,
                                             method=cluster_method)

        if cluster_genes:
            # cluster genes
            matrix = cluster.cluster_genes(matrix,
                                           metric=gene_cluster_metric,
                                           method=cluster_method)

        # add a "Signature"-labeled row to the top,
        # which represents the signature expression vector
        title = self.get_label(include_id=include_id,
                               include_stats=include_stats,
                               include_pval=include_pval)
        mean = np.mean(matrix.X, axis=0)
        header_row = ExpMatrix(genes=['<b>Signature</b>'],
                               samples=matrix.samples,
                               X=np.atleast_2d(mean))
        combined_matrix = pd.concat([header_row, matrix], axis=0)

        heatmap = ExpHeatmap(combined_matrix,
                             title=title,
                             colorbar_label=colorbar_label,
                             **kwargs)

        return heatmap
Example #14
    def __init__(self, *args, **kwargs):
        ExpMatrix.__init__(self, *args, **kwargs)
Example #15
def run_pipeline_old(config_file):
    """inDrop pipeline."""

    t0 = time.time()

    conf, errors = config.read_config(config_file)

    input_ = conf['input']
    output = conf['output']
    params = conf['parameters']
    pipeline = conf['pipeline']

    output_dir = output['output_dir']

    barcode1_file = resource_filename(
        'singlecell', 'data/indrop/gel_barcode1_list.txt')
    barcode2_file = resource_filename(
        'singlecell', 'data/indrop/gel_barcode2_list.txt')

    if not util.is_empty_dir(output_dir):
        if output['allow_nonempty_output_dir']:
            _LOGGER.info('Note: Output directory exists and is not empty.')
        else:
            _LOGGER.error(
                'Output directory is not empty! Either specify an empty '
                '(or non-existent) output directory, or specify '
                '"allow_nonempty_output_dir: yes" in the configuration file.')
            return 1

    # create a timestamp for this run
    timestamp = time.strftime('%Y-%m-%d_%H-%M-%S')

    # create output directory, if necessary
    misc.make_sure_dir_exists(output_dir)

    # create results directory
    results_dir = os.path.join(output_dir, 'results')
    misc.make_sure_dir_exists(results_dir)

    # add file handler to the _LOGGER
    pipeline_log_file = os.path.join(results_dir, 'pipeline_log.txt')
    file_handler = logging.FileHandler(pipeline_log_file)
    log_fmt = '[%(asctime)s] %(levelname)s: %(message)s'
    log_datefmt = '%Y-%m-%d %H:%M:%S'
    formatter = logging.Formatter(log_fmt, log_datefmt)
    file_handler.setFormatter(formatter)
    _LOGGER.addHandler(file_handler)

    _LOGGER.info('This is the inDrop pipeline of SingleCell v%s', __version__)
    _LOGGER.info('Pipeline run timestamp: %s', timestamp)

    if params['use_docker']:
        _LOGGER.info('We\'re running using docker!')
    else:
        _LOGGER.info('We\'re running without using docker!')

    # create plot directory
    plot_dir = os.path.join(results_dir, 'qc_plots')
    misc.make_sure_dir_exists(plot_dir)

    # copy configuration file to results directory
    output_config_file = os.path.join(results_dir,
                                      'pipeline_config_%s.yaml' % timestamp)
    _LOGGER.info('Copying configuration file to "%s"', output_config_file)
    shutil.copyfile(config_file, output_config_file)
    
    ### process reads
    processed_read_dir = os.path.join(output_dir, 'processed_reads')
    misc.make_sure_dir_exists(processed_read_dir)
    process_read_file = os.path.join(
        processed_read_dir, 'processed_reads.fastq')
    process_count_file = os.path.join(results_dir, 'barcode_counts_reads.tsv')

    if pipeline['skip_read_processing']:
        _LOGGER.info('Skipping read processing step!')
    
    else:
        _LOGGER.info('Processing reads...')
        reads.process_reads(
            input_['barcode_read_file'], input_['mrna_read_file'],
            barcode1_file, barcode2_file,
            process_read_file, process_count_file,
            max_reads=params['max_reads'])
        _LOGGER.info('Finished processing reads.')

    
    ### mapping reads with STAR
    barcode_counts_mapped_file = os.path.join(results_dir,
                                              'barcode_counts_mapped.tsv')
    map_script_file = os.path.join(results_dir, 'map_with_star.sh')
    map_log_file = os.path.join(results_dir, 'mapping_log.txt')
    mapping_dir = os.path.join(output_dir, 'aligned_reads')

    alignment_file = os.path.join(mapping_dir, 'Aligned.sortedByCoord.out.bam')


    if pipeline['skip_mapping']:
        _LOGGER.info('Skipping read mapping step!')

    else:
        # mapping
        _LOGGER.info('Mapping reads with STAR...')
        star_params = conf['STAR']
        mapping.map_with_star(process_read_file, input_['star_index_dir'],
                              map_script_file, map_log_file,
                              mapping_dir,
                              num_threads=params['num_threads'],
                              compressed=False,
                              use_docker=params['use_docker'],
                              **star_params)
        _LOGGER.info('Finished mapping reads.')

        # count mapped reads for each barcode
        _LOGGER.info('Counting mapped reads for each barcode...')
        barcodes.count_mapped_reads(
            alignment_file,
            barcode1_file, barcode2_file,
            barcode_counts_mapped_file)
        _LOGGER.info('Finished counting mapped reads for each barcode.')

    ### generate intermediate files (chromosome lengths; protein-coding genes)
    chromlen_file = os.path.join(results_dir, 'chromosome_lengths.tsv')
    gene_file = os.path.join(results_dir, 'genes.tsv')
    if (not pipeline['skip_aligned_read_processing']) or \
            (not pipeline['skip_expression_quantification']):

        logger = logging.getLogger('genometools.ensembl')
        logger.setLevel(logging.ERROR)

        # generate file containing chromosome lengths
        _LOGGER.info('Extracting chromosome lengths...')
        chromlen = ensembl.get_chromosome_lengths(input_['genome_file'])
        chromlen.to_csv(chromlen_file, sep='\t', header=True)
        _LOGGER.info('Finished extracting chromosome lengths.')

        # generate file containing protein-coding genes
        _LOGGER.info('Extracting list of protein-coding genes from Ensembl GTF'
                     ' file...')
        protein_coding_genes = ensembl.get_protein_coding_genes(
            input_['genome_annotation_file'])
        _LOGGER.info('Finished extracting list of protein-coding genes.')

        if params['include_lincRNA_genes']:
            # extract lincRNA genes
            _LOGGER.info('Extracting list of lincRNA genes from Ensembl GTF'
                         ' file...')
            linc_rna_genes = ensembl.get_linc_rna_genes(
                input_['genome_annotation_file'])
            _LOGGER.info('Finished extracting list of lincRNA genes.')
            # exclude lincRNA whose gene name clashes with that of a
            # protein-coding gene
            sel = ~linc_rna_genes['name'].isin(
                set(protein_coding_genes['name']))
            linc_rna_genes = linc_rna_genes.loc[sel]
            genes = pd.concat([protein_coding_genes, linc_rna_genes])
        else:
            genes = protein_coding_genes
        
        genes.to_csv(gene_file, sep='\t', index=False)

    ### process aligned reads
    read_info_dir = os.path.join(output_dir, 'read_info')
    if not pipeline['skip_aligned_read_processing']:
        _LOGGER.info('Processing aligned reads...')
        misc.make_sure_dir_exists(read_info_dir)
        aligned_reads.process_aligned_reads(
            alignment_file, chromlen_file,
            gene_file, input_['genome_annotation_file'], read_info_dir,
            num_jobs=params['num_threads'])
        _LOGGER.info('Finished processing of aligned reads.')
    else:
        _LOGGER.info('Skipping processing of aligned reads!')

    ### quantify gene and transcript expression
    num_cells = params['num_cells']

    gene_expression_file = os.path.join(
        results_dir, 'gene_expression.mtx')
    transcript_expression_file = os.path.join(
        results_dir, 'transcript_expression.mtx')

    dense_gene_expression_file = None
    if output['generate_dense_expression_matrix']:
        dense_gene_expression_file = os.path.join(
            results_dir, 'gene_expression.tsv')

    if not pipeline['skip_expression_quantification']:
        _LOGGER.info('Quantifying expression for top %d cells...', num_cells)
        expression.quantify_expression(
            barcode_counts_mapped_file,
            chromlen_file,
            read_info_dir,
            gene_file, input_['genome_annotation_file'],
            num_cells,
            gene_expression_file, transcript_expression_file,
            min_umi_qual=params['min_umi_qual'],
            cell_prefix=output['cell_prefix'],
            dense_gene_expression_output_file=dense_gene_expression_file)
        _LOGGER.info('Finished expression quantification.')
    else:
        _LOGGER.info('Skipping expression quantification!')


    ### QC scripts
    if pipeline['skip_qc_plot_generation']:
        _LOGGER.info('Skipping the generation of QC plots!')

    else:
        _LOGGER.info('Generating QC plots...')

        _LOGGER.info('Plotting distribution of mapped reads per barcode...')
        mapped_count_histogram_file = \
                os.path.join(plot_dir, 'mapped_reads_histogram.html')
        barcodes.plot_read_count_distribution(
            barcode_counts_mapped_file, mapped_count_histogram_file)

        _LOGGER.info('Plotting distribution of transcripts per cell...')
        output_file = os.path.join(plot_dir, 'transcripts_per_cell.html')
        matrix = ExpMatrix.read_sparse(gene_expression_file)\
                .astype(np.float64)
        fig = qc.plot_cell_transcript_distribution(
            matrix, output['experiment_name'])
        plot(fig, filename=output_file, show_link=False, auto_open=False)

        _LOGGER.info('Plotting fraction of ribosomal and mitochondrial '
                     'gene expression...')
        output_file = os.path.join(plot_dir,
                                   'mito_ribo_expression.html')
        # reuse the gene expression matrix loaded for the previous plot
        fig = qc.plot_transcriptome_components(
            matrix, species=params['species'], name=output['experiment_name'],
            width=950, height=800, font_size=16, font_family='serif')
        plot(fig, filename=output_file, show_link=False, auto_open=False)

        _LOGGER.info('Plotting saturation...')
        output_file = os.path.join(plot_dir,
                                   'saturation.html')
        matrix = ExpMatrix.read_sparse(transcript_expression_file)\
                .astype(np.float64)
        fig = qc.plot_saturation(matrix)
        plot(fig, filename=output_file, show_link=False, auto_open=False)


    #_LOGGER.removeHandler(file_handler)
    t1 = time.time()
    t = t1 - t0
    _LOGGER.info('Pipeline run finished in %.1f s (%.1f min)!', t, t/60)
Example #16
def main(args=None):

    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs
        entrez = [e[:-3] for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names
    f = 0
    genes = []
    X = []

    # g = None
    for i, e in enumerate(entrez):
        # print e
        try:
            g = e2g[e]
        except KeyError:
            f += 1
        else:
            # check if there are multiple entrez IDs pointing to the same gene
            # assert g not in genes
            genes.append(g)
            X.append(matrix.X[i, :])
    assert len(genes) == len(set(genes))
    if f > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).', f, matrix.p,
            100 * (f / float(matrix.p)))

    # filter for known protein-coding genes
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))
    sel = np.zeros(p, dtype=np.bool_)
    for i in range(p):
        if genes[i] in genome:
            sel[i] = True
    sel = np.nonzero(sel)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]
    f = p - sel.size
    if f > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)', f, p, 100 * (f / float(p)))

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s', len(genes),
                 len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
Example #18
def my_matrix():
    genes = ['a', 'b', 'c', 'd', 'e', 'f']
    samples = ['s1', 's2', 's3']
    X = np.arange(18, dtype=np.float64).reshape(6, 3)
    matrix = ExpMatrix(genes=genes, samples=samples, X=X)
    return matrix
Example #19
def main(args=None):

    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError(
            "Python interpreter version >= 2.7 required, " "found %d.%d instead." % (vinfo.major, vinfo.minor)
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs
        entrez = [e[:-3] for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names
    f = 0
    genes = []
    X = []

    # g = None
    for i, e in enumerate(entrez):
        # print e
        try:
            g = e2g[e]
        except KeyError:
            f += 1
        else:
            # check if there are multiple entrez IDs pointing to the same gene
            # assert g not in genes
            genes.append(g)
            X.append(matrix.X[i, :])
    assert len(genes) == len(set(genes))
    if f > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs " "to gene symbols (%.1f%%).",
            f,
            matrix.p,
            100 * (f / float(matrix.p)),
        )

    # filter for known protein-coding genes
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))
    sel = np.zeros(p, dtype=np.bool_)
    for i in range(p):
        if genes[i] in genome:
            sel[i] = True
    sel = np.nonzero(sel)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]
    f = p - sel.size
    if f > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of " "protein-coding genes (%.1f%%)",
            f,
            p,
            100 * (f / float(p)),
        )

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug("Genes: %d, Samples: %d, matrix: %s", len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
Example #20
def main(args=None):
    """Run GO-PCA and store the result in a `pickle` file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
 
    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        # read arguments from the command line
        parser = get_argument_parser()

        # parse first with default options, in case "--help" is specified
        # ("--help" causes the program to exit at this point)
        args = parser.parse_args()

        # now remove the defaults and parse again
        # (removing the defaults is important so that we know which values
        # were specified by the user)
        no_defaults = dict([p, None] for p in GOPCA.get_param_defaults())
        no_defaults2 = dict([p, None] for p in GOPCAParams.get_param_defaults())
        no_defaults.update(no_defaults2)
        parser.set_defaults(**no_defaults)
        args = parser.parse_args()

    # reporting options
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # test if we can write to log_file?

    # configure root logger
    logger = util.get_logger(log_file=log_file, quiet=quiet)

    # check if required parameters were specified
    passed = True
    if args.expression_file is None:
        logger.error('No expression file specified!')
        passed = False
    if args.gene_set_file is None:
        logger.error('No gene set file specified!')
        passed = False
    if args.output_file is None:
        logger.error('No output file specified!')
        passed = False
    if not passed:
        logger.error('Not all required parameters were specified.')
        return 1

    # generate configuration
    if args.config_file is not None:
        # read parameter values from config file
        params = GOPCAParams.read_ini(args.config_file)
    else:
        # start with default configuration
        params = GOPCAParams()

    # overwrite parameters specified on the command line
    for p in GOPCAParams.get_param_defaults():
        v = getattr(args, p)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', p)
            params.set_param(p, v)

    global_params = GOPCA.get_param_defaults()
    for k in list(global_params.keys()):
        v = getattr(args, k)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', k)
            global_params[k] = v

    # read expression file
    matrix = ExpMatrix.read_tsv(args.expression_file)
    logger.info('Expression matrix size: '
                '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n)

    if args.sel_var_genes > 0:
        # filter genes by variance
        matrix = matrix.filter_variance(args.sel_var_genes)
    
    # read gene set file
    gene_sets = GeneSetCollection.read_tsv(args.gene_set_file)
    
    # read ontology file (if supplied)
    gene_ontology = None
    if args.gene_ontology_file is not None:
        p_logger = logging.getLogger(genometools.__name__)
        p_logger.setLevel(logging.ERROR)
        gene_ontology = GeneOntology.read_obo(
            args.gene_ontology_file,
            part_of_cc_only=params.go_part_of_cc_only)
        p_logger.setLevel(logging.NOTSET)
        
    M = GOPCA.simple_setup(matrix, params, gene_sets, gene_ontology,
                           verbose=verbose, **global_params)
    run = M.run()

    if run is None:
        logger.error('GO-PCA run failed!')
        return 1

    # write run to pickle file
    logger.info('Storing GO-PCA run in file "%s"...', args.output_file)
    run.write_pickle(args.output_file)

    return 0
Example #21
def my_matrix(my_gene_names, my_samples, my_X):
    #genes = ['a', 'b', 'c', 'd']
    #samples = ['s1', 's2', 's3']
    # X = np.arange(12, dtype=np.float64).reshape(4, 3)
    matrix = ExpMatrix(genes=my_gene_names, samples=my_samples, X=my_X)
    return matrix
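The arguments to my_matrix above are themselves fixtures; the commented-out lines hint at what they provide. A self-contained illustration of the same chaining pattern (fixture bodies here are assumptions based on those comments):

import numpy as np
import pytest

@pytest.fixture
def my_gene_names():
    return ['a', 'b', 'c', 'd']

@pytest.fixture
def my_samples():
    return ['s1', 's2', 's3']

@pytest.fixture
def my_X():
    return np.arange(12, dtype=np.float64).reshape(4, 3)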