コード例 #1
0
def from_peaks_bed(peaks_path, genomes):
    '''Create a FeatureReference from a peaks bed file.

    Args:
        peaks_path (str): Path to peaks bed file (tab-separated
            chrom/start/end columns). Can be None.
        genomes (list): Genomes on which peaks were defined. These appear as
            prefixes in peak contig names.

    Returns:
        FeatureReference: One FeatureDef per peak, named "chrom:start-end",
            tagged with its genome and an empty derivation.
    '''

    # Load peaks info
    feature_defs = []
    all_tag_keys = ['genome', 'derivation']

    if peaks_path:
        # Stuff relevant fields of peak tuple into FeatureDef.
        # Use 'r' instead of the legacy 'rU' mode ('rU' is deprecated and was
        # removed in Python 3.11); rstrip covers both LF and CRLF endings.
        with open(peaks_path, 'r') as pf:
            peaks = [
                "{}:{}-{}".format(*line.rstrip("\r\n").split("\t")) for line in pf
            ]
        for peak in peaks:
            genome = get_genome_from_contig(peak, genomes)
            feature_defs.append(
                FeatureDef(index=len(feature_defs),
                           id=peak,
                           name=peak,
                           feature_type=lib_constants.ATACSEQ_LIBRARY_TYPE,
                           tags={
                               'genome': genome,
                               'derivation': ''
                           }))

    return FeatureReference(feature_defs, all_tag_keys)
コード例 #2
0
def from_motif_list(motif_list):
    '''Build a FeatureReference from a list of motif names.

    Args:
        motif_list (list): list of motif names. Can be None.

    Returns:
        FeatureReference: One FeatureDef per motif, tagged with its genome
            and the 'POOL' derivation.
    '''
    all_tag_keys = ['genome', 'derivation']

    if not motif_list:
        return FeatureReference([], all_tag_keys)

    # One FeatureDef per motif; 'POOL' marks these as derived features.
    feature_defs = [
        FeatureDef(index=idx,
                   id=motif,
                   name=get_name_from_motif(motif),
                   feature_type=lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
                   tags={
                       'genome': get_genome_from_motif(motif),
                       'derivation': 'POOL'
                   })
        for idx, motif in enumerate(motif_list)
    ]

    return FeatureReference(feature_defs, all_tag_keys)
コード例 #3
0
 def _update_feature_ref(self):
     """Rebuild self.feature_ref to contain only the features kept by self.feature_mask."""
     kept = np.flatnonzero(self.feature_mask)
     source_defs = self.matrix.feature_ref.feature_defs
     self.feature_ref = FeatureReference(
         feature_defs=[source_defs[idx] for idx in kept],
         all_tag_keys=self.matrix.feature_ref.all_tag_keys)
コード例 #4
0
def from_bed_and_motifs(peaks_path, motif_list, genomes):
    '''Create a FeatureReference from a bed file of peaks and then from a motifs list.

    Args:
        peaks_path (str): bed file of peaks (tab-separated chrom/start/end).
            Can be None.
        motif_list (list): list of motif names. Can be None.
        genomes (list): Genomes for which the peaks and motifs are identified.

    Returns:
        FeatureReference: Peak features first, then motif features, with
            indices assigned in that order.
    '''

    feature_defs = []
    all_tag_keys = ['genome', 'derivation']

    # process peaks
    if peaks_path:
        # Stuff relevant fields of peak tuple into FeatureDef.
        # Use 'r' instead of the legacy 'rU' mode ('rU' is deprecated and was
        # removed in Python 3.11); rstrip covers both LF and CRLF endings.
        with open(peaks_path, 'r') as pf:
            peaks = [
                "{}:{}-{}".format(*line.rstrip("\r\n").split("\t")) for line in pf
            ]
        for peak in peaks:
            genome = get_genome_from_contig(peak, genomes)
            feature_defs.append(
                FeatureDef(index=len(feature_defs),
                           id=peak,
                           name=peak,
                           feature_type=lib_constants.ATACSEQ_LIBRARY_TYPE,
                           tags={
                               'genome': genome,
                               'derivation': ''
                           }))

    # process motifs; 'POOL' marks these as derived features
    if motif_list:
        for motif in motif_list:
            genome = get_genome_from_motif(motif)
            feature_defs.append(
                FeatureDef(
                    index=len(feature_defs),
                    id=motif,
                    name=get_name_from_motif(motif),
                    feature_type=lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
                    tags={
                        'genome': genome,
                        'derivation': 'POOL'
                    }))

    return FeatureReference(feature_defs, all_tag_keys)
コード例 #5
0
def from_transcriptome_and_csv(gene_ref_path, feature_def_filename):
    '''Create a FeatureReference.

    Combines gene features from a transcriptome reference with
    feature-barcode features parsed from a Feature Definition CSV.

    Args:
        gene_ref_path (str): Path to transcriptome reference. Can be None.
        feature_def_filename (str): Path to Feature Definition CSV file. Can be None.
    Returns:
        FeatureReference
    '''

    feature_defs = []
    all_tag_keys = ['genome']

    genomes = cr_utils.get_reference_genomes(gene_ref_path)

    if gene_ref_path is not None:
        gene_idx_filename = cr_utils.get_reference_genes_index(gene_ref_path)
        gene_index = cr_reference.GeneIndex.load_pickle(gene_idx_filename)

        # One FeatureDef per gene in the transcriptome, tagged with its genome.
        for gene in gene_index.genes:
            feature_defs.append(
                FeatureDef(
                    index=len(feature_defs),
                    id=gene.id,
                    name=gene.name,
                    feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                    tags={
                        'genome': cr_utils.get_genome_from_str(gene.id, genomes),
                    }))

    # Load feature definition file
    if feature_def_filename is not None:
        # CSV features are appended after the genes, so offset their indices.
        csv_feature_defs, csv_tag_keys = parse_feature_def_file(
            feature_def_filename, index_offset=len(feature_defs))

        # The CRISPR 'target_gene_id' field, if it exists, must match a
        # transcriptome entry.
        check_crispr_target_gene(csv_feature_defs, feature_defs)

        feature_defs.extend(csv_feature_defs)
        all_tag_keys.extend(csv_tag_keys)

    return FeatureReference(feature_defs, all_tag_keys)
コード例 #6
0
        def build_feature_ref(gene_ids, gene_names, genome_index):
            """Build a gene-expression FeatureReference.

            If genome_index holds a single genome, every gene is tagged with
            it; otherwise each gene's genome is taken from its id prefix
            (the text before the first '_').
            """
            if len(genome_index) == 1:
                # .keys()[0] breaks on Python 3 (dict views are not
                # subscriptable); next(iter(...)) works on both 2 and 3.
                single_genome = next(iter(genome_index))
                get_genome = lambda gene_id: single_genome
            else:
                get_genome = lambda gene_id: gene_id.split('_')[0]

            feature_defs = [
                FeatureDef(index=idx,
                           id=gene_id,
                           name=gene_name,
                           feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                           tags={'genome': get_genome(gene_id)})
                for idx, (gene_id, gene_name) in enumerate(zip(gene_ids, gene_names))
            ]

            return FeatureReference(feature_defs, ['genome'])
コード例 #7
0
    def select_features(self, indices):
        '''Return a new CountMatrix restricted to the given feature indices.

        FeatureDef indices are renumbered so they stay consistent with each
        feature's new row position in the subset matrix.
        '''
        selected = (self.feature_ref.feature_defs[idx] for idx in indices)

        renumbered_defs = []
        for new_idx, fd in enumerate(selected):
            renumbered_defs.append(
                FeatureDef(index=new_idx,
                           id=fd.id,
                           name=fd.name,
                           feature_type=fd.feature_type,
                           tags=fd.tags))

        subset_ref = FeatureReference(
            feature_defs=renumbered_defs,
            all_tag_keys=self.feature_ref.all_tag_keys)

        return CountMatrix(feature_ref=subset_ref,
                           bcs=self.bcs,
                           matrix=self.m[indices, :])
コード例 #8
0
    def from_v3_mtx(genome_dir):
        """Load a CountMatrix from a Cell Ranger v3 MEX directory (gzipped TSVs)."""
        barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv.gz")
        features_tsv = os.path.join(genome_dir, "features.tsv.gz")
        matrix_mtx = os.path.join(genome_dir, "matrix.mtx.gz")
        for filepath in (barcodes_tsv, features_tsv, matrix_mtx):
            if not os.path.exists(filepath):
                raise IOError("Required file not found: %s" % filepath)

        barcodes = pd.read_csv(barcodes_tsv,
                               delimiter='\t',
                               header=None,
                               usecols=[0]).values.squeeze()
        feature_table = pd.read_csv(features_tsv, delimiter='\t', header=None)

        # Columns are id, name, feature_type; the MEX format carries no tags.
        feature_defs = [
            FeatureDef(idx, row[0], row[1], row[2], [])
            for idx, (_, row) in enumerate(feature_table.iterrows())
        ]
        feature_ref = FeatureReference(feature_defs, [])

        return CountMatrix(feature_ref, barcodes, sp_io.mmread(matrix_mtx))
コード例 #9
0
    def from_legacy_mtx(genome_dir):
        """Load a CountMatrix from a legacy (pre-v3) MEX directory."""
        barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv")
        genes_tsv = os.path.join(genome_dir, "genes.tsv")
        matrix_mtx = os.path.join(genome_dir, "matrix.mtx")
        for filepath in (barcodes_tsv, genes_tsv, matrix_mtx):
            if not os.path.exists(filepath):
                raise IOError("Required file not found: %s" % filepath)

        def read_first_column(path):
            # Both TSVs are read the same way: first column only.
            return pd.read_csv(path,
                               delimiter='\t',
                               header=None,
                               usecols=[0]).values.squeeze()

        barcodes = read_first_column(barcodes_tsv)
        genes = read_first_column(genes_tsv)

        # Legacy files carry only gene ids; names and tags are absent.
        feature_defs = [
            FeatureDef(idx, gene_id, None, "Gene Expression", [])
            for (idx, gene_id) in enumerate(genes)
        ]
        feature_ref = FeatureReference(feature_defs, [])

        return CountMatrix(feature_ref, barcodes, sp_io.mmread(matrix_mtx))
コード例 #10
0
 def load_feature_ref_from_h5_group(group):
     """Deserialize only the FeatureReference stored under the feature-ref key of an h5py.Group."""
     return FeatureReference.from_hdf5(group[h5_constants.H5_FEATURE_REF_ATTR])
コード例 #11
0
    def from_legacy_v1_h5(cls, h5_file):
        """Create a CountMatrix from a legacy h5py.File (format version 1).

        Each top-level group of the file holds one genome's CSC matrix;
        genomes are stacked row-wise into a single matrix over a shared,
        deduplicated barcode space.

        Args:
            h5_file (h5py.File): Open legacy (v1) matrix file.
        Returns:
            CountMatrix
        """

        genome_arrays = []
        gene_id_arrays = []
        gene_name_arrays = []
        bc_idx_arrays = []
        feat_idx_arrays = []
        data_arrays = []

        # Map barcode string to column index in new matrix
        barcode_map = OrderedDict()

        # Construct a genome-concatenated matrix and FeatureReference
        for genome_idx, genome in enumerate(h5_file.keys()):
            g = h5_file[genome]

            n_genes = sum(len(x) for x in gene_id_arrays)

            # Offset the row (gene) indices by the number of genes seen so far
            feat_idx_arrays.append(g['indices'][:] + n_genes)

            # Map barcode (column) indices to a single unique barcode space
            barcodes = g['barcodes'][:]
            for bc in barcodes:
                if bc not in barcode_map:
                    barcode_map[bc] = len(barcode_map)

            remapped_col_inds = np.fromiter(
                (barcode_map[bc] for bc in barcodes),
                count=len(barcodes),
                dtype='uint64',
            )

            indptr = g['indptr'][:]
            assert len(indptr) == 1 + len(remapped_col_inds)

            if genome_idx == 0:
                # For the first set of barcodes encountered, there should
                # be no change in their new indices.
                assert np.array_equal(remapped_col_inds,
                                      np.arange(len(indptr) - 1))

            # Convert from CSC to COO by expanding the indptr array out
            nz_elems_per_bc = np.diff(indptr)
            assert len(nz_elems_per_bc) == len(g['barcodes'])

            bc_idx = np.repeat(remapped_col_inds, nz_elems_per_bc)
            assert len(bc_idx) == len(g['indices'])
            assert len(bc_idx) == len(g['data'])

            bc_idx_arrays.append(bc_idx)
            data_arrays.append(g['data'][:])

            gene_id_arrays.append(g['genes'][:])
            gene_name_arrays.append(g['gene_names'][:])
            genome_arrays.append(np.repeat(genome, len(g['genes'])))

        genomes = np.concatenate(genome_arrays)
        gene_ids = np.concatenate(gene_id_arrays)
        gene_names = np.concatenate(gene_name_arrays)

        # Construct FeatureReference.
        # zip instead of itertools.izip: izip was removed in Python 3, and
        # the eager py2 zip is harmless at this size.
        feature_defs = []
        for (gene_id, gene_name, genome) in zip(gene_ids, gene_names, genomes):
            feature_defs.append(
                FeatureDef(
                    index=len(feature_defs),
                    id=gene_id,
                    name=gene_name,
                    feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                    tags={'genome': genome}))
        feature_ref = FeatureReference(feature_defs, ['genome'])

        i = np.concatenate(feat_idx_arrays)
        j = np.concatenate(bc_idx_arrays)
        data = np.concatenate(data_arrays)

        # Insertion order of the OrderedDict defines the barcode order;
        # list() so this also works on Python 3, where keys() is a view.
        assert isinstance(barcode_map, OrderedDict)
        barcodes = list(barcode_map.keys())

        matrix = sp_sparse.csc_matrix((data, (i, j)),
                                      shape=(len(gene_ids), len(barcodes)))

        return CountMatrix(feature_ref, barcodes, matrix)
コード例 #12
0
def join(args, outs, chunk_defs, chunk_outs):
    """Join stage: build the feature-barcode matrix and aggregate analysis outputs.

    Combines the filtered peak matrix (and, when present, the TF matrix) into
    one feature-barcode matrix, then copies the chosen factorization's
    reduction, clustering, t-SNE and enrichment results into outs.analysis
    and outs.analysis_csv.
    """
    # Nothing to do without a filtered matrix or any reduction results.
    if args.filtered_peak_bc_matrix is None or not args.reduction_summary[
            'h5'].keys():
        outs.analysis = None
        outs.analysis_csv = None
        outs.feature_bc_matrix = None
        return

    # Make the FBM
    # build joint Peak + TF count matrix for single genomes
    # combine peak annotations for single genome analysis
    peak_annotation = None
    if args.peak_annotation:
        annotations = pd.read_csv(args.peak_annotation,
                                  sep='\t')[['gene', 'peak_type']]
        annotations = annotations.replace(np.nan, '', regex=True)
        annotations = annotations.values.astype(str).tolist()
        peak_annotation = []
        for row in annotations:
            # Each row holds ';'-separated gene names and their peak types,
            # in matching order.
            genes = row[0].split(";")
            annotation = row[1].split(";")
            promoter = []
            nearby_gene = []
            assert len(annotation) == len(genes)
            for en, kind in enumerate(annotation):
                # Promoter genes are a subset of the nearby genes.
                if kind == 'promoter':
                    promoter += [genes[en]]
                nearby_gene += [genes[en]]
            peak_annotation += [[';'.join(promoter), ';'.join(nearby_gene)]]
    fbm = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    mapping = None
    if args.filtered_tf_bc_matrix:
        # combine matrices, ensure the barcodes are same and ordered the same way
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(
            args.filtered_tf_bc_matrix)
        assert (fbm.bcs == tf_matrix.bcs).all()
        if peak_annotation is not None:
            fbm.feature_ref = FeatureReference.addtags(
                fbm.feature_ref, ['promoter', 'nearby_gene'], peak_annotation)
            # NOTE(review): unlike the call above, no tag values are passed
            # here — presumably addtags then adds empty tags so both
            # references share the same tag keys; confirm its default.
            tf_matrix.feature_ref = FeatureReference.addtags(
                tf_matrix.feature_ref, ['promoter', 'nearby_gene'])
        combined_feature_defs = FeatureReference.join(fbm.feature_ref,
                                                      tf_matrix.feature_ref)
        combined_matrix = vstack([fbm.m, tf_matrix.m])
        # explicit map linking rows in diffexp to combined matrix
        mapping = np.zeros((tf_matrix.features_dim, 2))
        for x in range(tf_matrix.features_dim):
            mapping[x, 0] = x
            # TF rows sit below the peak rows in the stacked matrix.
            mapping[x, 1] = x + fbm.features_dim
        fbm = cr_matrix.CountMatrix(combined_feature_defs, fbm.bcs,
                                    combined_matrix)
    fbm.save_h5_file(outs.feature_bc_matrix,
                     sw_version=martian.get_pipelines_version())

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(outs.feature_bc_matrix, 'r') as matrix, \
            h5.File(outs.analysis, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        # NOTE(review): keys()[0] only works on Python 2; on Python 3 the
        # keys view is not subscriptable.
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    # Prefer the default factorization when available, else take the first.
    # NOTE(review): indexing factorizations[0] likewise assumes a Python 2
    # list from dict.keys().
    factorizations = args.reduction_summary['h5'].keys()
    USE_FACTORIZATION = DEFAULT_FACTORIZATION if DEFAULT_FACTORIZATION in factorizations else factorizations[
        0]
    with tables.open_file(outs.analysis, 'a') as out:
        for summary, key in zip([
                args.reduction_summary, args.clustering_summary,
                args.tsne_summary, args.enrichment_analysis_summary
        ], [USE_FACTORIZATION, 'clustering', 'tsne', 'enrichment']):
            if summary is None or not summary:
                continue
            print(key, summary)
            data_h5 = summary['h5'][USE_FACTORIZATION]
            # Merge this summary's h5 contents into the combined analysis h5,
            # and copy the matching CSV tree alongside it.
            with tables.open_file(data_h5, 'r') as indata:
                indata.copy_children(indata.root, out.root, recursive=True)
            dirname = os.path.join(outs.analysis_csv, key)
            cr_io.copytree(summary['csv'][USE_FACTORIZATION], dirname)

    # if mapping is present (single genome case), so is the coloring matrix
    if mapping is not None:
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('feature_DE_map', data=mapping)
        args.coerce_strings()
        tf_propZ_matrix = np.loadtxt(args.tf_propZ_matrix)
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('diffexp_coloring_matrix', data=tf_propZ_matrix)
コード例 #13
0
 def get_feature_ref(self):
     """Deserialize and return the FeatureReference stored in this file's h5 group."""
     fref_group = self.h5[h5_constants.H5_FEATURE_REF_ATTR]
     return FeatureReference.from_hdf5(fref_group)
コード例 #14
0
    def open(filename, mode, feature_ref=None, barcodes=None, library_info=None,
             barcode_info=None):
        """Open a molecule info object.

        Args:
          filename (str): Filename to open or create
          mode (str): 'r' for reading, 'w' for writing.
          feature_ref (FeatureReference): Required when mode is 'w'.
          barcodes (list of str): All possible barcode sequences. Required when mode is 'w'.
          library_info (list of dict): Library metadata. Required when mode is 'w'.
          barcode_info (BarcodeInfo): Per-barcode metadata.
        Returns:
          MoleculeInfo: A new object
        Raises:
          ValueError: If a required argument is missing in 'w' mode, or if the
            file's format version differs from CURR_FILE_VERSION in 'r' mode.
          AttributeError: If the file contains an unrecognized dataset key.
        """
        assert mode == 'r' or mode == 'w'

        mc = MoleculeCounter()

        if mode == 'w':
            if feature_ref is None:
                raise ValueError('Feature reference must be specified when opening a molecule info object for writing')
            if barcodes is None:
                raise ValueError('Barcodes must be specified when opening a molecule info object for writing')
            if library_info is None:
                raise ValueError('Library info must be specified when opening a molecule info object for writing')
            if barcode_info is None:
                raise ValueError('Barcode info must be specified when opening a molecule info object for writing')

            mc.h5 = h5py.File(filename, 'w')
            # File-level metadata. (The original wrote FILE_VERSION_KEY twice;
            # the redundant second write has been removed.)
            cr_io.set_hdf5_attr(mc.h5, FILE_VERSION_KEY, CURR_FILE_VERSION)
            cr_io.set_hdf5_attr(mc.h5, h5_constants.H5_FILETYPE_KEY, MOLECULE_H5_FILETYPE)

            mc.h5.create_group(METRICS_GROUP_NAME)

            # Write feature reference
            fref_group = mc.h5.create_group(h5_constants.H5_FEATURE_REF_ATTR)
            feature_ref.to_hdf5(fref_group)

            # Write barcodes
            # If there are multiple barcode lengths, use the largest for the numpy dtype.
            # Builtin max over a generator works on both Python 2 and 3;
            # np.max(map(...)) fails on Python 3 where map is lazy.
            max_barcode_len = max(len(bc) for bc in barcodes)
            barcode_dtype = np.dtype('S%d' % max_barcode_len)
            mc.h5.create_dataset('barcodes', data=np.fromiter(barcodes, barcode_dtype, count=len(barcodes)), compression=HDF5_COMPRESSION)

            # Write library info
            lib_info_json = json.dumps(library_info, indent=4, sort_keys=True)
            cr_io.create_hdf5_string_dataset(mc.h5, 'library_info', [lib_info_json])

            # Write barcode info
            g = mc.h5.create_group(BARCODE_INFO_GROUP_NAME)
            MoleculeCounter.save_barcode_info(barcode_info, g)

            # Create empty per-molecule datasets, resizable so molecules can
            # be appended later. (.items() instead of py2-only .iteritems().)
            for name, col_type in MOLECULE_INFO_COLUMNS.items():
                mc.columns[name] = mc.h5.create_dataset(name, (0,),
                                                        maxshape=(None,),
                                                        dtype=col_type,
                                                        compression=HDF5_COMPRESSION,
                                                        chunks=(HDF5_CHUNK_SIZE,))

        elif mode == 'r':
            mc.h5 = h5py.File(filename, 'r')

            try:
                mc.file_version = mc.h5.attrs[FILE_VERSION_KEY]
            except (KeyError, AttributeError):
                # h5py raises KeyError for a missing attribute; the original
                # AttributeError is kept for safety with other h5py versions.
                mc.file_version = 1 # V1 doesn't have version field

            if mc.file_version < CURR_FILE_VERSION:
                raise ValueError('The molecule info HDF5 file (format version %d) was produced by an older version of Cell Ranger. Reading these files is unsupported.' % mc.file_version)
            if mc.file_version > CURR_FILE_VERSION:
                raise ValueError('The molecule info HDF5 file (format version %d) was produced by a newer version of Cell Ranger. Reading these files is unsupported.' % mc.file_version)

            # Route each top-level key to the right slot on the counter.
            for key in mc.h5.keys():
                if key in MOLECULE_INFO_COLUMNS:
                    mc.columns[key] = mc.h5[key]
                elif key in MOLECULE_REF_COLUMNS:
                    mc.ref_columns[key] = mc.h5[key]
                elif key == h5_constants.H5_FEATURE_REF_ATTR:
                    mc.feature_reference = FeatureReference.from_hdf5(mc.h5[key])
                elif key == METRICS_GROUP_NAME \
                     or key == BARCODE_INFO_GROUP_NAME:
                    pass
                else:
                    raise AttributeError("Unrecognized dataset key: %s" % key)

            # Load library info
            mc.library_info = json.loads(cr_io.read_hdf5_string_dataset(mc.h5['library_info'])[0])

        return mc