def find_subread_entry(cmph5_alignment, subread_dict):
    """Find the entry in subread_dict that corresponds to the cmph5_alignment.

    Match the movie name and hole number, then find which of the bax.h5
    subread bounds overlap with the bounds of the alignment.

    Args:
        cmph5_alignment: a CmpH5IO.CmpH5Alignment
        subread_dict: dictionary from affixes.subread_dictionary that will be
            searched for a match with the cmph5_alignment

    Returns:
        key, overlapping_bounds: the (movie_name, hole_number) key and the
            first subread bounds found that intersect the alignment's
            (rStart, rEnd) range.
        (None, None) if the alignment's key is not in subread_dict.
        (key, None) if the key is present but none of its subread bounds
            intersect the alignment.
    """
    key = (cmph5_alignment.movieInfo.Name, cmph5_alignment.HoleNumber)
    if key not in subread_dict:
        # If this alignment isn't in the input fofn, just skip it
        return None, None

    # Figure out which subread this alignment is from by checking for
    # overlap with the bounds from the region table.
    alignment_range = (cmph5_alignment.rStart, cmph5_alignment.rEnd)
    # Iterate the dict directly: same keys as iterkeys(), but also valid on
    # Python 3 where iterkeys() no longer exists.
    for subread_bounds in subread_dict[key]:
        if BasH5IO.intersectRanges(alignment_range,
                                   subread_bounds) is not None:
            return key, subread_bounds

    return key, None
def get_chemistry_info(sam_header, input_filenames, fail_on_missing=False):
    """Get chemistry triple information for movies referenced in a SAM header.

    Args:
        sam_header: a pysam.Samfile.header, which is a multi-level dictionary.
            Movie names are read from RG tags in this header.
        input_filenames: a list of bas, bax, or fofn filenames.
        fail_on_missing: if True, raise an exception if the chemistry
            information for a movie in the header cannot be found. If False,
            just log a warning.

    Returns:
        a list of strings that can be written as DS tags to RG entries in the
        header of a new SAM or BAM file. For example,
        ['BINDINGKIT:xxxx;SEQUENCINGKIT:yyyy;SOFTWAREVERSION:2.0']
        An empty list is returned when the header has no 'RG' section.

    Raises:
        ChemistryLoadingException if chemistry information cannot be found
        for a movie in the header and fail_on_missing is True.
    """
    # First get the full list of ba[sx] files, reading through any fofn or xml
    # inputs
    bas_filenames = []
    for filename in input_filenames:
        bas_filenames.extend(FofnIO.enumeratePulseFiles(filename))

    # Then get the chemistry triple for each movie in the list of bas files
    triple_dict = {}
    for bas_filename in bas_filenames:
        bas_file = BasH5IO.BasH5Reader(bas_filename)
        try:
            triple_dict[bas_file.movieName] = bas_file.chemistryBarcodeTriple
        finally:
            # Only two attributes are needed; release the file handle right
            # away so long input lists do not accumulate open files.
            bas_file.close()

    # Finally, find the movie names that appear in the header and collect the
    # chemistry triple for each read group
    if 'RG' not in sam_header:
        return []

    rgds_entries = {}
    for rg_entry in sam_header['RG']:
        rg_id = rg_entry['ID']
        rg_movie_name = rg_entry[MOVIENAME_TAG]

        try:
            rg_chem_triple = triple_dict[rg_movie_name]
        except KeyError:
            err_msg = (
                "Cannot find chemistry information for movie {m}.".format(
                    m=rg_movie_name))
            if fail_on_missing:
                raise ChemistryLoadingException(err_msg)
            log.warning(err_msg)
        else:
            rgds_entries[rg_id] = rg_chem_triple

    return format_rgds_entries(rgds_entries)
def setUpClass(cls):
    """Open the test bax.h5 file once and build its subread dictionary."""
    base_test_case.BaseTestCase.setUpClass()

    data_dir = base_test_case.ROOT_DATA_DIR
    bax_path = os.path.join(
        data_dir, "m130522_092457_42208_cTEST1_s1_p0.1.bax.h5")

    reader = BasH5IO.BasH5Reader(bax_path)
    cls.bash5_reader = reader
    cls.subread_dict = affixes.subread_dictionary(reader)
def setUpClass(cls):
    """Open the test bax.h5 / cmp.h5 files and compute affix boundaries.

    The readers, the subread dictionary and the affix boundaries (computed
    with a minimum affix size of 1) are stored on the class so every test
    can share them.
    """
    base_test_case.BaseTestCase.setUpClass()

    data_dir = base_test_case.ROOT_DATA_DIR
    bax_path = os.path.join(
        data_dir, "m130522_092457_42208_cTEST1_s1_p0.1.bax.h5")
    cmp_path = os.path.join(data_dir, "test_alignment.cmp.h5")

    cls.bash5_reader = BasH5IO.BasH5Reader(bax_path)
    cls.cmph5_reader = CmpH5IO.CmpH5Reader(cmp_path)

    subreads = affixes.subread_dictionary(cls.bash5_reader)
    cls.subread_dict = subreads
    cls.affix_bounds = affixes.affix_boundaries(
        subreads, cls.cmph5_reader, 1)
def setUpClass(cls):
    """Build the shared fixtures for the affix region table tests.

    Opens the test bax.h5 and cmp.h5 files, computes the subread dictionary
    and affix boundaries (minimum affix size 1), reads the original
    'PulseData/Regions' table and derives the affix region table from it.
    """
    base_test_case.BaseTestCase.setUpClass()

    data_dir = base_test_case.ROOT_DATA_DIR
    bax_path = os.path.join(
        data_dir, "m130522_092457_42208_cTEST1_s1_p0.1.bax.h5")
    cmp_path = os.path.join(data_dir, "test_alignment.cmp.h5")

    cls.bash5_reader = BasH5IO.BasH5Reader(bax_path)
    cls.cmph5_reader = CmpH5IO.CmpH5Reader(cmp_path)

    cls.subread_dict = affixes.subread_dictionary(cls.bash5_reader)
    cls.affix_bounds = affixes.affix_boundaries(
        cls.subread_dict, cls.cmph5_reader, 1)

    regions_dataset = cls.bash5_reader.file['PulseData/Regions']
    cls.original_region_table = regions_dataset.value
    cls.region_table = affixes.affix_region_table(
        cls.original_region_table,
        cls.bash5_reader.movieName,
        cls.affix_bounds)
def test_no_overlap_alignments(self):
    """Affixes cannot overlap with any aligned part of any subread."""
    # Group the aligned read ranges by (movie name, hole number).
    alignment_dict = {}
    for alignment in self.cmph5_reader:
        key = (alignment.movieInfo.Name, alignment.HoleNumber)
        alignment_dict.setdefault(key, []).append(
            (alignment.rStart, alignment.rEnd))

    for key, affix_ranges in self.affix_bounds.items():
        # A key with no alignments has nothing to overlap with. Default to
        # an empty sequence instead of falling through with the previous
        # key's alignments (or an unbound name on the first iteration).
        aligned_ranges = alignment_dict.get(key, ())
        for affix in affix_ranges:
            for aligned_range in aligned_ranges:
                self.assertIsNone(
                    BasH5IO.intersectRanges(affix, aligned_range))
def test_no_overlap_alignments(self):
    """Affixes cannot overlap with any aligned part of any subread."""
    # Collect every aligned (rStart, rEnd) range per (movie, hole) key.
    alignment_dict = {}
    for alignment in self.cmph5_reader:
        key = (alignment.movieInfo.Name, alignment.HoleNumber)
        alignment_dict.setdefault(key, []).append(
            (alignment.rStart, alignment.rEnd))

    for key, affix_ranges in self.affix_bounds.items():
        if key not in alignment_dict:
            # No alignments for this key, so no overlap is possible. Skip
            # it rather than reusing the alignments of the previous key
            # (or hitting an unbound local on the first iteration).
            continue
        aligned_ranges = alignment_dict[key]
        for affix in affix_ranges:
            for aligned_range in aligned_ranges:
                self.assertIsNone(BasH5IO.intersectRanges(
                    affix, aligned_range))
def create_affix_region_tables(input_fofn_filename, cmph5_filename,
                               output_path, min_affix_size):
    """Create the pbbridgemapper rgn.h5 and fofn files.

    One rgn.h5 file is written per bax.h5 file into
    <output_path>/pbbridgemapper_regions, and a fofn listing those rgn.h5
    files (newline separated, no trailing newline) is written into
    output_path.

    Args:
        input_fofn_filename: fofn of bax.h5 filenames
        cmph5_filename: aligned_reads.cmp.h5 for the bax.h5 files
        output_path: path where the fofn and rgn.h5 files will be written
        min_affix_size: smallest affix that will be included in the region
            table

    Returns:
        output_fofn_filename: file name of the FOFN of pbbridgemapper rgn.h5
            files

    Raises:
        OSError: if the regions directory cannot be created for a reason
            other than it already existing.
    """
    bash5_filenames = list(FofnIO.readFofn(input_fofn_filename))
    logging.info("Read filenames from input fofn file: %s", bash5_filenames)

    regions_dir = os.path.join(output_path, 'pbbridgemapper_regions')
    output_rgn_filenames = []

    cmph5_reader = CmpH5IO.CmpH5Reader(cmph5_filename)
    try:
        logging.info("Opened %s", cmph5_filename)

        try:
            os.makedirs(regions_dir)
        except OSError as e:
            # An already-existing directory is fine; anything else is fatal.
            if e.errno != errno.EEXIST:
                # Keep the original errno so callers can still tell why the
                # directory could not be created.
                raise OSError(
                    e.errno,
                    "Could not create regions directory {d}.".format(
                        d=regions_dir))

        for bash5_filename in bash5_filenames:
            logging.debug("Getting affix boundaries from %s", bash5_filename)
            bash5_reader = BasH5IO.BasH5Reader(bash5_filename)
            try:
                subread_dict = pbbridgemapper.affixes.subread_dictionary(
                    bash5_reader)
                logging.debug("Created subread dictionary from %d ZMWs",
                              len(subread_dict))

                affix_bounds = pbbridgemapper.affixes.affix_boundaries(
                    subread_dict, cmph5_reader, min_affix_size)
                # .values() instead of .itervalues() so this also runs on
                # Python 3.
                logging.debug("Found %d unmapped affixes",
                              sum(len(bounds)
                                  for bounds in affix_bounds.values()))

                original_region_table = (
                    bash5_reader.file.get('/PulseData/Regions').value)
                affix_region_table = (
                    pbbridgemapper.affixes.affix_region_table(
                        original_region_table, bash5_reader.movieName,
                        affix_bounds))

                # foo.bax.h5 / foo.bas.h5 -> foo.rgn.h5
                output_rgn_filename = os.path.join(
                    regions_dir,
                    re.sub(r"ba[sx]\.h5$", "rgn.h5",
                           os.path.basename(bash5_filename)))
                pbbridgemapper.affixes.write_region_table(
                    affix_region_table, output_rgn_filename, bash5_reader)
            finally:
                # Close the bax.h5 file even if building or writing the
                # region table failed.
                bash5_reader.close()

            logging.info("Wrote pbbridgemapper region table to %s",
                         output_rgn_filename)
            output_rgn_filenames.append(output_rgn_filename)
    finally:
        # The cmp.h5 is only needed while computing affix boundaries.
        cmph5_reader.close()

    # Now the rgn.h5 files have been created, we just need to make the fofn
    output_fofn_filename = os.path.join(
        output_path,
        re.sub("fofn$", "pbbridgemapper_regions.fofn",
               os.path.basename(input_fofn_filename)))
    with open(output_fofn_filename, 'w') as output_fofn_file:
        # One filename per line, without a trailing newline.
        output_fofn_file.write('\n'.join(output_rgn_filenames))
    logging.info("Wrote rgn file names to %s", output_fofn_filename)

    return output_fofn_filename
def create_pbbridgemapper_output(input_fofn_filename, affix_cmph5_filename,
                                 primary_cmph5_filename, split_reads_filename,
                                 unique_only):
    """Create the split_reads file that SMRTview wants.

    For every subread in the input fofn, keep the primary alignment with the
    highest mapQV, then the highest-mapQV prefix and suffix affix alignments
    relative to that primary alignment, and write the result to
    split_reads_filename.

    Args:
        input_fofn_filename: fofn of bax.h5 filenames
        affix_cmph5_filename: cmp.h5 of the affix alignments; an empty
            cmp.h5 (CmpH5IO.EmptyCmpH5Error) is treated as "no affix
            alignments"
        primary_cmph5_filename: cmp.h5 of the primary alignments
        split_reads_filename: output file name
        unique_only: if true, remove_nonunique_alignments is applied before
            writing
    """
    # First build a dictionary of all the subreads in the input fofn
    all_subread_dict = {}
    for bash5_filename in list(FofnIO.readFofn(input_fofn_filename)):
        bash5_reader = BasH5IO.BasH5Reader(bash5_filename)
        try:
            all_subread_dict.update(
                pbbridgemapper.affixes.subread_dictionary(bash5_reader))
        finally:
            bash5_reader.close()

    # Now iterate through the primary alignments, recording best scoring
    # primary alignments for each subread
    primary_cmph5_reader = CmpH5IO.CmpH5Reader(primary_cmph5_filename)
    try:
        for alignment in primary_cmph5_reader:
            _record_primary_alignment(all_subread_dict, alignment)
    finally:
        primary_cmph5_reader.close()

    # Now iterate through the affix alignments
    try:
        affix_cmph5_reader = CmpH5IO.CmpH5Reader(affix_cmph5_filename)
    except CmpH5IO.EmptyCmpH5Error:
        affix_cmph5_reader = None

    if affix_cmph5_reader is not None:
        try:
            for alignment in affix_cmph5_reader:
                _record_affix_alignment(all_subread_dict, alignment)
        finally:
            affix_cmph5_reader.close()

    if unique_only:
        pbbridgemapper.smrtview_output.remove_nonunique_alignments(
            all_subread_dict)

    pbbridgemapper.smrtview_output.write_split_reads_file(
        all_subread_dict, split_reads_filename)


def _record_primary_alignment(all_subread_dict, alignment):
    """Store alignment as its subread's primary hit if it has the best mapQV."""
    key, overlapping_bounds = pbbridgemapper.affixes.find_subread_entry(
        alignment, all_subread_dict)
    # key is None when the ZMW is not in the input fofn; overlapping_bounds
    # is None when no subread of that ZMW intersects the alignment.
    if key is None or overlapping_bounds is None:
        return

    entry = all_subread_dict[key][overlapping_bounds]
    existing_alignment = None if entry is None else entry['primary']

    # Strict '>' keeps the first alignment seen when mapQVs tie.
    if (existing_alignment is None or
            alignment.mapQV > existing_alignment['map_qv']):
        all_subread_dict[key][overlapping_bounds] = {
            'primary': (
                pbbridgemapper.smrtview_output.alignment_to_output_dict(
                    alignment, overlapping_bounds)),
        }


def _record_affix_alignment(all_subread_dict, alignment):
    """Store alignment as the best prefix/suffix hit of its subread."""
    key, overlapping_bounds = pbbridgemapper.affixes.find_subread_entry(
        alignment, all_subread_dict)
    if key is None or overlapping_bounds is None:
        return

    entry = all_subread_dict[key][overlapping_bounds]
    if entry is None:
        # No primary alignment was recorded for this subread, so there is
        # nothing to be a prefix or suffix of.
        return

    alignment_dict = pbbridgemapper.smrtview_output.alignment_to_output_dict(
        alignment, overlapping_bounds)
    primary_dict = entry['primary']

    # Figure out if this is a prefix or suffix alignment
    if alignment_dict['subread_end'] <= primary_dict['subread_start']:
        affix_type = 'prefix'
    elif alignment_dict['subread_start'] >= primary_dict['subread_end']:
        affix_type = 'suffix'
    else:
        # Overlaps the primary alignment: neither a prefix nor a suffix.
        return

    existing_alignment = entry.get(affix_type)
    if (existing_alignment is None or
            alignment.mapQV > existing_alignment['map_qv']):
        entry[affix_type] = alignment_dict