def _get_reads_info(aligned_reads_file):
    """
    Collect the hole numbers seen in an alignment dataset, grouped by cell.

    Every resource reader of the dataset is scanned. When the dataset
    reports itself as indexed, hole numbers and read-group ids are read from
    the reader's ``holeNumber`` / ``qId`` columns; otherwise each alignment
    record is visited one at a time.

    :param aligned_reads_file: (str) path to aligned_reads[.xml,.bam]
    :return tuple (reads_by_cell, instrument) (dict, string): a mapping of
        cell name to the set of hole numbers seen for that cell, and the
        instrument name derived from the first cell encountered (None when
        no reads were found)
    """
    instrument = None
    holes_by_cell = defaultdict(set)
    with AlignmentSet(aligned_reads_file) as dataset:
        for reader in dataset.resourceReaders():
            if dataset.isIndexed:
                logging.info("Indexed file - will use fast loop.")
                # Resolve the movie name through the read group of each entry.
                hole_movie_pairs = (
                    (hole_number, reader.readGroupInfo(rg_id).MovieName)
                    for hole_number, rg_id in zip(reader.holeNumber,
                                                  reader.qId)
                )
            else:
                hole_movie_pairs = (
                    (record.HoleNumber, record.movieName)
                    for record in reader
                )
            for hole_number, movie_name in hole_movie_pairs:
                cell = movie_to_cell(movie_name)
                # The instrument is taken from the first cell seen only.
                if instrument is None:
                    instrument = _cell_2_inst(cell)
                holes_by_cell[cell].add(hole_number)
    return holes_by_cell, instrument
def test_timestamped_moviename(self):
    """
    Check cell and instrument parsing for a timestamped movie name
    """
    timestamped_movie = "m54004_151002_00100"
    derived_cell = movie_to_cell(timestamped_movie)
    # XXX for the time being the cellname will be the timestamped
    # moviename...
    self.assertEqual(timestamped_movie, derived_cell)
    instrument = _cell_2_inst(derived_cell)
    self.assertEqual('54004', instrument)
def test_movie_2_cell(self):
    """ Parse a cell name from a movie name """
    # NOTE: the docstring above is written to the log at runtime (via
    # __doc__ below), so its text is part of the test's output.
    try:
        log.info(TestUtil.test_movie_2_cell.__doc__)
        expected_cell = 'm120128_025832_42129_c100277632550000001523007907041250'
        movie_name = 'm120128_025832_42129_c100277632550000001523007907041250_s2_p0'
        self.assertEqual(expected_cell, movie_to_cell(movie_name))
    except:
        # Record the traceback in the log, then let the failure propagate.
        log.error(traceback.format_exc())
        raise
def test_movie_2_cell(self):
    """ Parse a cell name from a movie name """
    # NOTE: this docstring is logged at runtime through __doc__, so its
    # wording is kept exactly as it was.
    try:
        log.info(TestUtil.test_movie_2_cell.__doc__)
        movie = 'm120128_025832_42129_c100277632550000001523007907041250_s2_p0'
        wanted_cell = 'm120128_025832_42129_c100277632550000001523007907041250'
        self.assertEqual(wanted_cell, movie_to_cell(movie))
    except:
        # Log the full traceback before re-raising the original error.
        log.error(traceback.format_exc())
        raise
def run(dataset_file):
    """
    Count the distinct movies and cells referenced by a dataset.

    For an ``HdfSubreadSet`` each external file is mapped to a movie with
    ``path_to_movie``; for any other dataset type each external file is
    opened as BAM and the ``PU`` value of every read group in its header is
    used as the movie name. Cells are obtained by passing every movie
    through ``movie_to_cell``.

    :param dataset_file: (str) path to the dataset, opened with openDataSet
    :return: the result of ``meta_rpt.apply_view`` on a Report holding the
        cell-count and movie-count attributes
    """
    movies = set()
    with openDataSet(dataset_file) as ds:
        # The dataset type is the same for every file, so check it once.
        is_hdf_subreads = type(ds).__name__ == "HdfSubreadSet"
        for file_name in ds.toExternalFiles():
            if is_hdf_subreads:
                movies.add(path_to_movie(file_name))
            else:
                with BamReader(file_name) as bam:
                    movies.update(rg["PU"] for rg in bam.peer.header["RG"])
    # Several movies can map to the same cell; the set removes duplicates.
    cells = set(movie_to_cell(movie) for movie in movies)
    attrs = [
        Attribute(Constants.A_NCELLS, len(cells)),
        Attribute(Constants.A_NMOVIES, len(movies)),
    ]
    report = Report(meta_rpt.id, attributes=attrs)
    return meta_rpt.apply_view(report)
def run(dataset_file):
    """
    Count the distinct movies and cells referenced by a dataset.

    For an ``HdfSubreadSet`` each external file is mapped to a movie with
    ``path_to_movie``; for any other dataset type each external file is
    opened as BAM and the ``PU`` value of every read group in its header is
    used as the movie name. Cells are obtained by passing every movie
    through ``movie_to_cell``.

    :param dataset_file: (str) path to the dataset, opened with openDataSet
    :return: a Report with id 'overview' holding the 'ncells' and 'nmovies'
        attributes
    """
    movies = set()
    with openDataSet(dataset_file) as ds:
        # The dataset type is the same for every file, so check it once.
        is_hdf_subreads = type(ds).__name__ == "HdfSubreadSet"
        for file_name in ds.toExternalFiles():
            if is_hdf_subreads:
                movies.add(path_to_movie(file_name))
            else:
                with BamReader(file_name) as bam:
                    movies.update(rg["PU"] for rg in bam.peer.header["RG"])
    # Several movies can map to the same cell; the set removes duplicates.
    cells = set(movie_to_cell(movie) for movie in movies)
    attrs = [
        Attribute('ncells', len(cells), name="SMRT Cells"),
        Attribute('nmovies', len(movies), name="Movies"),
    ]
    return Report('overview', attributes=attrs)
def run(dataset_file):
    """
    Count the distinct movies and cells referenced by a dataset.

    For an ``HdfSubreadSet`` each external file is mapped to a movie with
    ``path_to_movie``; for any other dataset type each external file is
    opened as BAM and the ``PU`` value of every read group in its header is
    used as the movie name. Cells are obtained by passing every movie
    through ``movie_to_cell``.

    :param dataset_file: (str) path to the dataset, opened with openDataSet
    :return: the result of ``spec.apply_view`` on a Report (id
        ``Constants.R_ID``) holding the cell-count and movie-count
        attributes
    """
    movies = set()
    with openDataSet(dataset_file) as ds:
        # The dataset type is the same for every file, so check it once.
        is_hdf_subreads = type(ds).__name__ == "HdfSubreadSet"
        for file_name in ds.toExternalFiles():
            if is_hdf_subreads:
                movies.add(path_to_movie(file_name))
            else:
                with BamReader(file_name) as bam:
                    movies.update(rg["PU"] for rg in bam.peer.header["RG"])
    # Several movies can map to the same cell; the set removes duplicates.
    cells = set(movie_to_cell(movie) for movie in movies)
    attrs = [
        Attribute(Constants.A_NCELLS, len(cells)),
        Attribute(Constants.A_NMOVIES, len(movies)),
    ]
    report = Report(Constants.R_ID, attributes=attrs)
    return spec.apply_view(report)