def build_from_mol_counter(molecule_counter, subsample_rate=1.0, subsample_result=None):
    """ Construct a GeneBCMatrices object from a MoleculeCounter.

    Args:
        molecule_counter - Provides the barcode whitelist, barcode length,
                           gem groups, reference columns and molecule iterator.
        subsample_rate (float) - Passed through to the molecule iterator.
        subsample_result (dict) - Return some metrics results into this dict;
                                  'mapped_reads' is set to the summed reads of
                                  the iterated molecules. Ignored when None.

    Returns:
        The GeneBCMatrices populated from the iterated molecules.
    """
    # Rebuild every barcode sequence present in the original matrices
    whitelist = cr_utils.load_barcode_whitelist(molecule_counter.get_barcode_whitelist())
    bc_length = molecule_counter.get_barcode_length()
    if not bc_length:
        # Fall back to the length of the first whitelist entry
        bc_length = len(whitelist[0])
    groups = molecule_counter.get_gem_groups()
    barcode_seqs = cr_utils.format_barcode_seqs(whitelist, groups)

    # Rebuild Gene tuples from the molecule info ref columns
    gene_ids = molecule_counter.get_ref_column('gene_ids')
    genome_ids = molecule_counter.get_ref_column('genome_ids')
    gene_names = molecule_counter.get_ref_column('gene_names')
    gene_tuples = []
    for gene_id, gene_name in itertools.izip(gene_ids, gene_names):
        gene_tuples.append(cr_constants.Gene(gene_id, gene_name, None, None, None))
    genes_by_genome = cr_utils.split_genes_by_genomes(gene_tuples, genome_ids)

    result = GeneBCMatrices(genome_ids, genes_by_genome, barcode_seqs)

    # Tally reads across the (possibly subsampled) molecules
    total_reads = 0
    molecules = molecule_counter.get_molecule_iter(bc_length, subsample_rate=subsample_rate)
    for molecule in molecules:
        result.add(molecule.genome, molecule.gene_id, molecule.barcode)
        total_reads += molecule.reads

    if subsample_result is not None:
        subsample_result['mapped_reads'] = total_reads

    return result
def load(group):
    """ Load a GeneBCMatrix (genes, barcodes and sparse matrix) from an h5 group.

    Gene names fall back to the gene ids when the group has no gene-names
    attribute.

    Args:
        group - Object whose attributes (named by the cr_constants.H5_* constants)
                each expose a read() method.

    Returns:
        GeneBCMatrix whose matrix is a scipy CSC matrix built from the stored
        data / indices / indptr / shape arrays.
    """
    def read_attr(attr_name):
        return getattr(group, attr_name).read()

    gene_ids = list(read_attr(cr_constants.H5_GENE_IDS_ATTR))
    if hasattr(group, cr_constants.H5_GENE_NAMES_ATTR):
        gene_names = list(read_attr(cr_constants.H5_GENE_NAMES_ATTR))
    else:
        gene_names = gene_ids
    assert len(gene_ids) == len(gene_names)

    genes = []
    for gene_id, gene_name in itertools.izip(gene_ids, gene_names):
        genes.append(cr_constants.Gene(gene_id, gene_name, None, None, None))

    barcodes = list(read_attr(cr_constants.H5_BCS_ATTR))
    loaded = GeneBCMatrix(genes, barcodes)

    shape = read_attr(cr_constants.H5_MATRIX_SHAPE_ATTR)
    data = read_attr(cr_constants.H5_MATRIX_DATA_ATTR)
    indices = read_attr(cr_constants.H5_MATRIX_INDICES_ATTR)
    indptr = read_attr(cr_constants.H5_MATRIX_INDPTR_ATTR)

    # quick check to make sure indptr increases monotonically (to catch overflow bugs)
    indptr_steps = np.diff(indptr)
    assert np.all(indptr_steps >= 0)

    loaded.m = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)
    return loaded
def load_snps(filename):
    """ Load SNPs from a JSON file, one Gene tuple per entry.

    Args:
        filename (str) - Path to a JSON file whose top-level value is iterable.

    Returns:
        list of Gene tuples with id str(snp), an empty name and None for the
        remaining fields.
    """
    # HACK: Save SNPs as Gene tuples so we can reuse code in GeneBCMatrices
    snp_genes = []
    with open(filename, 'r') as handle:
        for snp in json.load(handle):
            snp_genes.append(cr_constants.Gene(str(snp), '', None, None, None))
    return snp_genes
def load_genes_from_h5_group(group):
    """ Load just the genes from an h5 group.

    Gene names fall back to the gene ids when the group has no gene-names
    attribute.

    Args:
        group - Object whose gene id / gene name attributes expose read().

    Returns:
        list of Gene tuples (id, name, None, None, None).
    """
    ids_node = getattr(group, cr_constants.H5_GENE_IDS_ATTR)
    gene_ids = list(ids_node.read())

    if not hasattr(group, cr_constants.H5_GENE_NAMES_ATTR):
        gene_names = gene_ids
    else:
        names_node = getattr(group, cr_constants.H5_GENE_NAMES_ATTR)
        gene_names = list(names_node.read())

    assert len(gene_ids) == len(gene_names)

    loaded_genes = []
    for gene_id, gene_name in itertools.izip(gene_ids, gene_names):
        loaded_genes.append(cr_constants.Gene(gene_id, gene_name, None, None, None))
    return loaded_genes
def load_mtx(genome_dir):
    """ Load a GeneBCMatrix from a directory of barcodes.tsv, genes.tsv and matrix.mtx.

    Only the first column of each TSV is used; genes are created with that
    column as their id and None for every other field.

    Args:
        genome_dir (str) - Directory containing the three files.

    Returns:
        GeneBCMatrix whose matrix is whatever sp_io.mmread returns for matrix.mtx.

    Raises:
        IOError - If any of the three required files does not exist.
    """
    barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv")
    genes_tsv = os.path.join(genome_dir, "genes.tsv")
    matrix_mtx = os.path.join(genome_dir, "matrix.mtx")

    for filepath in (barcodes_tsv, genes_tsv, matrix_mtx):
        if not os.path.exists(filepath):
            raise IOError("Required file not found: %s" % filepath)

    # Select column 0 explicitly instead of squeeze(): squeeze() turns a
    # single-row file into a 0-d array, which cannot be iterated below.
    barcodes = pd.read_csv(barcodes_tsv, delimiter='\t', header=None, usecols=[0]).values[:, 0]
    gene_ids = pd.read_csv(genes_tsv, delimiter='\t', header=None, usecols=[0]).values[:, 0]
    genes = [cr_constants.Gene(gene_id, None, None, None, None) for gene_id in gene_ids]

    matrix = sp_io.mmread(matrix_mtx)

    gbm = GeneBCMatrix(genes, barcodes)
    gbm.m = matrix
    return gbm
def load_gtf(self, in_gtf_fn, fasta_parser=None):
    """ Build transcripts and genes from the exon records of a GTF file.

    Args:
        in_gtf_fn - GTF file, passed to self.gtf_reader_iter.
        fasta_parser - Optional object providing
                       get_transcript_gc_content(transcript). When None, the
                       GC content of every transcript and gene is None.

    Returns:
        (transcripts, genes) where transcripts maps transcript_id -> Transcript
        and genes is a list of Gene tuples in order of first appearance.
        Gene length / GC content are the medians over the gene's transcripts;
        gene intervals span min start to max end per chromosome.
    """
    transcripts = {}
    gene_to_transcripts = collections.OrderedDict()

    for row, is_comment, properties in self.gtf_reader_iter(in_gtf_fn):
        if is_comment:
            continue

        chrom, _, annotation, start, end, _, strand, _, _ = row

        if annotation != "exon":
            continue

        # Shift start down by one (presumably converting 1-based inclusive GTF
        # coordinates to 0-based half-open -- confirm against the GTF reader).
        start = int(start) - 1
        end = int(end)
        length = abs(end - start)

        transcript_id = properties['transcript_id']
        gene_id = properties['gene_id']
        gene_name = properties.get('gene_name', gene_id)
        gene = cr_constants.Gene(gene_id, gene_name, None, None, None)

        if transcript_id not in transcripts:
            transcripts[transcript_id] = cr_constants.Transcript(gene, None, None, [])

        if gene not in gene_to_transcripts:
            gene_to_transcripts[gene] = set()

        # A transcript must always be reported under the same gene
        assert transcripts[transcript_id].gene == gene
        transcripts[transcript_id].intervals.append(
            cr_constants.Interval(chrom, start, end, length, strand))
        gene_to_transcripts[gene].add(transcript_id)

    # Transcript length and GC content
    have_fasta = fasta_parser is not None
    transcript_lengths = {}
    transcript_gc_contents = {}
    for transcript_id, transcript in transcripts.iteritems():
        transcript_lengths[transcript_id] = sum(
            interval.length for interval in transcript.intervals)
        # Always populate the entry so later lookups cannot raise KeyError
        # when no FASTA parser was supplied.
        if have_fasta:
            transcript_gc_contents[transcript_id] = \
                fasta_parser.get_transcript_gc_content(transcript)
        else:
            transcript_gc_contents[transcript_id] = None

    # Gene length, GC content and start + end positions
    genes = []
    for gene, transcript_ids in gene_to_transcripts.iteritems():
        length = np.median(
            [transcript_lengths[transcript_id] for transcript_id in transcript_ids])
        if have_fasta:
            gc_content = np.median(
                [transcript_gc_contents[transcript_id] for transcript_id in transcript_ids])
        else:
            gc_content = None

        transcript_intervals = []
        for transcript_id in transcript_ids:
            transcript_intervals += transcripts[transcript_id].intervals
        # groupby only merges adjacent items, so sort by the same key first
        transcript_intervals.sort(key=lambda interval: interval.chrom)

        intervals = []
        for chrom, chrom_intervals_iter in itertools.groupby(
                transcript_intervals, lambda interval: interval.chrom):
            chrom_intervals = list(chrom_intervals_iter)
            start = min(interval.start for interval in chrom_intervals)
            end = max(interval.end for interval in chrom_intervals)
            intervals.append(
                cr_constants.Interval(chrom, start, end, end - start, None))

        full_gene = cr_constants.Gene(gene.id, gene.name, length, gc_content, intervals)
        genes.append(full_gene)

        # Re-point each transcript at the fully annotated gene
        for transcript_id in transcript_ids:
            transcripts[transcript_id] = cr_constants.Transcript(
                full_gene,
                transcript_lengths[transcript_id],
                transcript_gc_contents[transcript_id],
                transcripts[transcript_id].intervals)

    return transcripts, genes
def select_genes(self, gene_indices):
    """ Return a new GeneBCMatrix containing only the selected genes.

    Args:
        gene_indices - Index applied both to the gene list (via a numpy array)
                       and to the first axis of the matrix.

    Returns:
        GeneBCMatrix with the selected genes (id and name kept, other fields
        None), a copy of the barcode list and the matching matrix rows.
    """
    chosen = np.array(self.genes)[gene_indices]
    new_genes = []
    for fields in chosen:
        new_genes.append(cr_constants.Gene(fields[0], fields[1], None, None, None))

    subset = GeneBCMatrix(new_genes, list(self.bcs))
    subset.m = self.m[gene_indices, :]
    return subset