def __init__(self, ctx):
    """
    Build the aligner and the hit-processing chain for this task.

    Runs ``__get_configuration`` first, then wires an event monitor, a
    BwaAligner and a FilterLink-based visitor chain together, and finally
    records where the input split ends.

    @param ctx: task context; passed to the parent constructor, the event
      monitor and the emitter link, and queried for the job configuration
      and the input split.
    """
    # NOTE(review): super(type(self), self) recurses forever as soon as
    # this class is subclassed; naming the class explicitly avoids that.
    super(type(self), self).__init__(ctx)
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)

    monitor = HadoopEventMonitor(
        self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)
    self.event_monitor = monitor

    # aligner settings are copied from attributes of this object
    aligner = self.aligner = BwaAligner()
    aligner.event_monitor = monitor
    aligner.qformat = self.format
    aligner.max_isize = self.max_isize
    aligner.nthreads = self.nthreads
    aligner.trim_qual = self.trim_qual
    aligner.mmap_enabled = True

    # hit processor chain: the filter comes first, then one emitter that
    # depends on whether the job is map-only
    if self.__map_only:
        emitter_class = EmitSamLink
    else:
        emitter_class = MarkDuplicatesEmitter
    chain = FilterLink(monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    chain.set_next(emitter_class(ctx, monitor))
    aligner.hit_visitor = chain

    # path to the reference index
    self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
    aligner.reference = self.get_reference_root(self.ref_archive)

    # the split end is needed by the workaround for accumulating
    # records, see #331
    split = InputSplit(ctx.getInputSplit())
    self.split_end = split.offset + split.length
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname,
                     log_level=logging.INFO, pairing_batch_size=None,
                     seq_list_len=None, fastq_subfmt="fastq-illumina"):
    """
    Align paired reads with BwaAligner and return the collected mappings.

    @param refseq_fname: value assigned to the aligner's ``reference``.
    @param read_fname: path of the file holding the first read of each pair.
    @param mate_fname: path of the file holding the second read of each
      pair; it is parsed in lockstep with ``read_fname``.
    @param log_level: level set on the "PY" logger.
    @param pairing_batch_size: not used by this function; kept so existing
      callers keep working.
    @param seq_list_len: maximum number of pairs pulled from the input at
      each iteration; ``None`` pulls every remaining pair at once.
    @param fastq_subfmt: format name handed to ``Bio.SeqIO.parse``.
    @return: a list of dicts with keys ``name``, ``aux`` and ``seq``, one
      per mapping collected from the aligner.
    """
    logger = logging.getLogger("PY")
    logger.setLevel(log_level)
    logger.info("RUNNING PYTHON VERSION")

    class ResultCollector(object):
        """Hit visitor that stores both mappings of every pair it sees."""

        def __init__(self):
            self.result = []

        def process(self, pair):
            self.result.append(pair[0])
            self.result.append(pair[1])

    def bwam_to_hash(bwa_m):
        """Map a bwa mapping to a plain dictionary."""
        return dict(
            name=bwa_m.name,
            aux=bwa_m.tags,
            seq=bwa_m.get_seq_5(),
        )

    result = ResultCollector()

    # The records are consumed batch by batch inside the loop, so both
    # files have to stay open until the loop is done; the with-blocks
    # close them afterwards (they were previously never closed).
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            pairs_flow = it.izip(
                Bio.SeqIO.parse(read_file, fastq_subfmt),
                Bio.SeqIO.parse(mate_file, fastq_subfmt))
            while True:
                start = time.time()
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                # turn the biopython SeqRecords into simple tuples
                # NOTE(review): only the first 5 pairs of each batch are
                # converted and aligned, and each one is printed; this looks
                # like a debugging leftover -- confirm before removing.
                tuples = [
                    (read.name, read.seq.tostring(), None,
                     mate.seq.tostring(), None)
                    for read, mate in pairs[0:5]
                ]
                for t in tuples:
                    # single parenthesised value: same output as `print t`
                    print(t)
                logger.info('reading seqs %f sec' % (time.time() - start))

                start = time.time()
                aligner = BwaAligner()
                aligner.reference = refseq_fname
                aligner.hit_visitor = result
                for t in tuples:
                    aligner.load_pair_record(t)
                aligner.run_alignment()
                aligner.clear_batch()
                logger.info('alignment %f sec' % (time.time() - start))

    return [bwam_to_hash(bwa_m) for bwa_m in result.result]
def setUp(self):
    """
    Prepare an aligner on the utils reference and load the pairs fixture.

    ``self.pairs`` ends up holding one list of tab-separated fields for
    every non-comment line of ``pairs.txt``.
    """
    utils.build_ref_index()
    aligner = self.aligner = BwaAligner()
    aligner.reference = utils.reference
    aligner.hit_visitor = type(self).SimpleVisitor()
    self.pairs = []
    with open(utils.get_fixture_path("pairs.txt")) as fixture:
        # lines starting with '#' are left for comments
        self.pairs.extend(
            row.rstrip("\r\n").split("\t")
            for row in fixture
            if not row.startswith("#"))
def __init__(self, ctx):
    """
    Build the aligner and the hit-processing chain for this task.

    Runs ``__get_configuration`` first, then wires an event monitor, a
    BwaAligner and a FilterLink-based visitor chain together, and finally
    records where the input split ends.

    @param ctx: task context; passed to the parent constructor, the event
      monitor and the emitter link, and queried for the job configuration
      and the input split.
    """
    # NOTE(review): super(type(self), self) recurses forever as soon as
    # this class is subclassed; naming the class explicitly avoids that.
    super(type(self), self).__init__(ctx)
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)

    monitor = HadoopEventMonitor(
        self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)
    self.event_monitor = monitor

    # aligner settings are copied from attributes of this object
    aligner = self.aligner = BwaAligner()
    aligner.event_monitor = monitor
    aligner.qformat = self.format
    aligner.max_isize = self.max_isize
    aligner.nthreads = self.nthreads
    aligner.trim_qual = self.trim_qual
    aligner.mmap_enabled = True

    # hit processor chain: the filter comes first, then one emitter that
    # depends on whether the job is map-only
    if self.__map_only:
        emitter_class = EmitSamLink
    else:
        emitter_class = MarkDuplicatesEmitter
    chain = FilterLink(monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    chain.set_next(emitter_class(ctx, monitor))
    aligner.hit_visitor = chain

    # path to the reference index
    self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
    aligner.reference = self.get_reference_root(self.ref_archive)

    # the split end is needed by the workaround for accumulating
    # records, see #331
    split = InputSplit(ctx.getInputSplit())
    self.split_end = split.offset + split.length
def setUp(self):
    """
    Create an aligner on the mini reference fixture and store one
    five-element read-pair record in ``self.pair``.
    """
    aligner = self.aligner = BwaAligner()

    # the fixture lives under the directory three levels above this file
    here = os.path.dirname(__file__)
    test_dir = os.path.abspath(os.path.join(here, '..', '..', '..'))
    aligner.reference = os.path.join(
        test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
    aligner.hit_visitor = MappingsCollector()
    aligner.qformat = "fastq-sanger"

    read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
    read_seq = "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG"
    read_qual = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################"
    mate_seq = "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT"
    mate_qual = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
    self.pair = (read_id, read_seq, read_qual, mate_seq, mate_qual)
class TestBwaAligner(unittest.TestCase):
    """
    Aligns one read pair against the mini reference fixture and compares
    the two collected mappings with the expected records.
    """

    def setUp(self):
        """
        Create an aligner on the mini reference fixture and store one
        five-element read-pair record in ``self.pair``.
        """
        aligner = self.aligner = BwaAligner()

        # the fixture lives under the directory three levels above this file
        here = os.path.dirname(__file__)
        test_dir = os.path.abspath(os.path.join(here, '..', '..', '..'))
        aligner.reference = os.path.join(
            test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
        aligner.hit_visitor = MappingsCollector()
        aligner.qformat = "fastq-sanger"

        read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
        read_seq = "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG"
        read_qual = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################"
        mate_seq = "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT"
        mate_qual = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
        self.pair = (read_id, read_seq, read_qual, mate_seq, mate_qual)

    def test_pair(self):
        """Align ``self.pair`` and check both mappings, in sorted order."""
        aligner = self.aligner
        aligner.load_pair_record(self.pair)
        aligner.run_alignment()
        aligner.clear_batch()
        mappings = sorted(aligner.hit_visitor.mappings)

        read_id, read_seq, read_qual, mate_seq, mate_qual = self.pair
        # NOTE(review): the fields of the expected records are separated by
        # single spaces here; SAM records are normally tab-separated, so
        # confirm the separator against the collector's actual output.
        sep = " "
        expected_mate = sep.join([
            read_id, "133 chr1 24762 0 * = 24762 0", mate_seq, mate_qual])
        expected_read = sep.join([
            read_id, "73 chr1 24762 37 101M = 24762 0", read_seq, read_qual,
            "XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:7T83G9"])

        self.assertEqual(expected_mate, mappings[0])
        self.assertEqual(expected_read, mappings[1])
class TestBwaAligner(unittest.TestCase):
    """
    Aligns one read pair against the mini reference fixture and compares
    the two collected mappings with the expected records.
    """

    def setUp(self):
        """
        Create an aligner on the mini reference fixture and store one
        five-element read-pair record in ``self.pair``.
        """
        aligner = self.aligner = BwaAligner()

        # the fixture lives under the directory three levels above this file
        here = os.path.dirname(__file__)
        test_dir = os.path.abspath(os.path.join(here, '..', '..', '..'))
        aligner.reference = os.path.join(
            test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
        aligner.hit_visitor = MappingsCollector()
        aligner.qformat = "fastq-sanger"

        read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
        read_seq = "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG"
        read_qual = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################"
        mate_seq = "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT"
        mate_qual = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
        self.pair = (read_id, read_seq, read_qual, mate_seq, mate_qual)

    def test_pair(self):
        """Align ``self.pair`` and check both mappings, in sorted order."""
        aligner = self.aligner
        aligner.load_pair_record(self.pair)
        aligner.run_alignment()
        aligner.clear_batch()
        mappings = sorted(aligner.hit_visitor.mappings)

        read_id, read_seq, read_qual, mate_seq, mate_qual = self.pair
        # NOTE(review): the fields of the expected records are separated by
        # single spaces here; SAM records are normally tab-separated, so
        # confirm the separator against the collector's actual output.
        sep = " "
        expected_mate = sep.join([
            read_id, "133 chr1 24762 0 * = 24762 0", mate_seq, mate_qual])
        expected_read = sep.join([
            read_id, "73 chr1 24762 37 101M = 24762 0", read_seq, read_qual,
            "XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:7T83G9"])

        self.assertEqual(expected_mate, mappings[0])
        self.assertEqual(expected_read, mappings[1])
def setUp(self):
    """
    Create an aligner on the mini reference fixture and store one
    five-element read-pair record in ``self.pair``.
    """
    aligner = self.aligner = BwaAligner()

    # the fixture lives under the directory three levels above this file
    here = os.path.dirname(__file__)
    test_dir = os.path.abspath(os.path.join(here, '..', '..', '..'))
    aligner.reference = os.path.join(
        test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
    aligner.hit_visitor = MappingsCollector()
    aligner.qformat = "fastq-sanger"

    read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
    read_seq = "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG"
    read_qual = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################"
    mate_seq = "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT"
    mate_qual = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
    self.pair = (read_id, read_seq, read_qual, mate_seq, mate_qual)
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} is the record key produced by the standard
      LineRecordReader; it is only used to detect the last record of the
      input split.  C{value} is a tab-separated text line with 5 fields:
      ID, read_seq, read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or
      alignment records in SAM format (map-only job).

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to
      launch.  If the value of this property is set to 0, then the mapper
      will directly output the mappings in SAM format, like BWA.  If set to
      a value > 0 the mapper will output mappings in the protobuf
      serialized format for the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level, specified as a
      logging module literal.

    @jobconf-param: C{mapred.cache.archives} distributed cache entry for
      the bwa index archive.  The entry is of the form HDFS_PATH#LINK_NAME.
      The archive for a given chromosome must contain (at the top level,
      i.e., no directories) all files generated by 'bwa index' for that
      chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the inferred
      isize is greater than this value, Smith-Waterman alignment for
      unmapped reads will be skipped.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many sequences
      should be processed at a time by the pairing function.  Status will
      be updated at each new batch: therefore, lowering this value can help
      avoid timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality
      score encoding.  Supported types are: 'fastq-sanger' and
      'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality threshold
      below which the mapping will be discarded.

    @jobconf-param: C{seal.seqal.remove_unmapped} boolean copied to the
      hit filter's C{remove_unmapped} attribute (default False).

    @jobconf-param: C{seal.seqal.nthreads} number of concurrent threads for
      the main alignment operation (default 1).

    @jobconf-param: C{seal.seqal.trim.qual} trim quality parameter,
      equivalent to the -q parameter for bwa align (default 0).
    """
    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    # New property name -> deprecated property name; handed to
    # deprecation_utils.convert_job_conf.
    DeprecationMap = {
        "seal.seqal.log.level": "bl.seqal.log.level",
        "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
        "seal.seqal.pairing.batch.size": "bl.seqal.pairing.batch.size",
        "seal.seqal.fastq-subformat": "bl.seqal.fastq-subformat",
        "seal.seqal.min_hit_quality": "bl.seqal.min_hit_quality",
        "seal.seqal.remove_unmapped": "bl.seqal.remove_unmapped",
        "seal.seqal.discard_duplicates": "bl.seqal.discard_duplicates",
        "seal.seqal.nthreads": "bl.seqal.nthreads",
        "seal.seqal.trim.qual": "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        """
        Copy the job configuration into attributes of self and validate it.

        @param ctx: task context providing the job configuration.
        @raise ValueError: for an unknown log level or an out-of-range
          numeric setting.
        """
        # TODO: refactor settings common to mapper and reducer
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(
            jc, self.DeprecationMap, logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format",
                     self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize',
                         'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size',
                         'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality',
                         'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped',
                          'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # turn the level name into the logging module's constant
        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" %
                (self.SUPPORTED_FORMATS,)
            )

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum qual value required for a hit to be kept.  By default
        # outputs all the hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.
        # Equivalent to the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # the job is map-only unless a positive number of reducers is set
        has_reducers = (jc.hasKey('mapred.reduce.tasks') and
                        jc.getInt('mapred.reduce.tasks') > 0)
        self.__map_only = not has_reducers

    def __is_last_record(self, k, v):
        """
        Tell whether the record with key C{k} and value C{v} reaches the
        end of the input split.
        """
        # NOTE(review): the +2 presumably leaves room for the line
        # terminator -- confirm against the record reader.
        return k + len(v) + 2 >= self.split_end

    def get_reference_root(self, ref_dir):
        """
        Given a directory containing a BWA indexed reference, such that all
        its files have a common name (except the extension), this method
        finds the path to the reference including the common name.

        e.g.
          my_reference/hg_18.bwt
          my_reference/hg_18.rsax
          my_reference/hg_18.sax   => "my_reference/hg_18"
          my_reference/hg_18.pac
          my_reference/irrelevant_file

        @param ref_dir: directory to scan for index files.
        @return: C{ref_dir} joined with the common root name.
        @raise ValueError: if no file with an extension listed in
          BWA_INDEX_EXT is found, or if the index files do not all share
          the same root name.
        """
        # Collect the distinct root names directly: unzipping an empty
        # list of (root, ext) pairs would fail with IndexError before the
        # "missing references" check could run.
        roots = set(
            root
            for root, ext in map(os.path.splitext, os.listdir(ref_dir))
            if ext.lstrip('.') in BWA_INDEX_EXT
        )
        if not roots:
            raise ValueError(
                "Missing references. Didn't find any files with required extensions (%s) at path %s" %
                (BWA_INDEX_EXT, ref_dir))
        if len(roots) != 1:
            raise ValueError("multiple references? Found reference roots %s" % (roots,))
        return os.path.join(ref_dir, roots.pop())

    def __init__(self, ctx):
        """
        Read the configuration, then build the aligner and the chain of
        hit processors it feeds.

        @param ctx: task context.
        """
        # Name the class explicitly: super(type(self), self) recurses
        # forever as soon as this class is subclassed.
        super(mapper, self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(
            self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        self.aligner = BwaAligner()
        self.aligner.event_monitor = self.event_monitor
        self.aligner.qformat = self.format
        self.aligner.max_isize = self.max_isize
        self.aligner.nthreads = self.nthreads
        self.aligner.trim_qual = self.trim_qual
        self.aligner.mmap_enabled = True

        ######## assemble hit processor chain
        chain = FilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(EmitSamLink(ctx, self.event_monitor))
        else:
            chain.set_next(MarkDuplicatesEmitter(ctx, self.event_monitor))
        self.aligner.hit_visitor = chain

        ######## set the path to the reference index
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        self.aligner.reference = self.get_reference_root(self.ref_archive)

        # part of the code is a workaround for accumulating records,
        # see #331
        isplit = InputSplit(ctx.getInputSplit())
        self.split_end = isplit.offset + isplit.length

    def map(self, ctx):
        """
        Load one input record into the aligner's batch and run the
        alignment when the batch is full or the input is finished.

        @param ctx: task context providing the input key and value.
        """
        # Accumulates reads in the aligner's batch, until batch size is
        # reached or until the input is finished.  At that point it calls
        # run_alignment, which emits the output through the hit visitor.
        k = struct.unpack(">q", ctx.getInputKey())[0]
        v = ctx.getInputValue()
        self.aligner.load_pair_record(v.split("\t"))
        is_last_record = self.__is_last_record(k, v)
        if self.aligner.get_batch_size() >= self.batch_size or is_last_record:
            self.aligner.run_alignment()
            self.aligner.clear_batch()
        if is_last_record:
            self.aligner.release_resources()
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} is the record key produced by the standard
      LineRecordReader; it is only used to detect the last record of the
      input split.  C{value} is a tab-separated text line with 5 fields:
      ID, read_seq, read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or
      alignment records in SAM format (map-only job).

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to
      launch.  If the value of this property is set to 0, then the mapper
      will directly output the mappings in SAM format, like BWA.  If set to
      a value > 0 the mapper will output mappings in the protobuf
      serialized format for the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level, specified as a
      logging module literal.

    @jobconf-param: C{mapred.cache.archives} distributed cache entry for
      the bwa index archive.  The entry is of the form HDFS_PATH#LINK_NAME.
      The archive for a given chromosome must contain (at the top level,
      i.e., no directories) all files generated by 'bwa index' for that
      chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the inferred
      isize is greater than this value, Smith-Waterman alignment for
      unmapped reads will be skipped.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many sequences
      should be processed at a time by the pairing function.  Status will
      be updated at each new batch: therefore, lowering this value can help
      avoid timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality
      score encoding.  Supported types are: 'fastq-sanger' and
      'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality threshold
      below which the mapping will be discarded.

    @jobconf-param: C{seal.seqal.remove_unmapped} boolean copied to the
      hit filter's C{remove_unmapped} attribute (default False).

    @jobconf-param: C{seal.seqal.nthreads} number of concurrent threads for
      the main alignment operation (default 1).

    @jobconf-param: C{seal.seqal.trim.qual} trim quality parameter,
      equivalent to the -q parameter for bwa align (default 0).
    """
    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    # New property name -> deprecated property name; handed to
    # deprecation_utils.convert_job_conf.
    DeprecationMap = {
        "seal.seqal.log.level": "bl.seqal.log.level",
        "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
        "seal.seqal.pairing.batch.size": "bl.seqal.pairing.batch.size",
        "seal.seqal.fastq-subformat": "bl.seqal.fastq-subformat",
        "seal.seqal.min_hit_quality": "bl.seqal.min_hit_quality",
        "seal.seqal.remove_unmapped": "bl.seqal.remove_unmapped",
        "seal.seqal.discard_duplicates": "bl.seqal.discard_duplicates",
        "seal.seqal.nthreads": "bl.seqal.nthreads",
        "seal.seqal.trim.qual": "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        """
        Copy the job configuration into attributes of self and validate it.

        @param ctx: task context providing the job configuration.
        @raise ValueError: for an unknown log level or an out-of-range
          numeric setting.
        """
        # TODO: refactor settings common to mapper and reducer
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(
            jc, self.DeprecationMap, logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format",
                     self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize',
                         'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size',
                         'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality',
                         'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped',
                          'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # turn the level name into the logging module's constant
        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" %
                (self.SUPPORTED_FORMATS,)
            )

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum qual value required for a hit to be kept.  By default
        # outputs all the hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.
        # Equivalent to the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # the job is map-only unless a positive number of reducers is set
        has_reducers = (jc.hasKey('mapred.reduce.tasks') and
                        jc.getInt('mapred.reduce.tasks') > 0)
        self.__map_only = not has_reducers

    def __is_last_record(self, k, v):
        """
        Tell whether the record with key C{k} and value C{v} reaches the
        end of the input split.
        """
        # NOTE(review): the +2 presumably leaves room for the line
        # terminator -- confirm against the record reader.
        return k + len(v) + 2 >= self.split_end

    def get_reference_root(self, ref_dir):
        """
        Given a directory containing a BWA indexed reference, such that all
        its files have a common name (except the extension), this method
        finds the path to the reference including the common name.

        e.g.
          my_reference/hg_18.bwt
          my_reference/hg_18.rsax
          my_reference/hg_18.sax   => "my_reference/hg_18"
          my_reference/hg_18.pac
          my_reference/irrelevant_file

        @param ref_dir: directory to scan for index files.
        @return: C{ref_dir} joined with the common root name.
        @raise ValueError: if no file with an extension listed in
          BWA_INDEX_EXT is found, or if the index files do not all share
          the same root name.
        """
        # Collect the distinct root names directly: unzipping an empty
        # list of (root, ext) pairs would fail with IndexError before the
        # "missing references" check could run.
        roots = set(
            root
            for root, ext in map(os.path.splitext, os.listdir(ref_dir))
            if ext.lstrip('.') in BWA_INDEX_EXT
        )
        if not roots:
            raise ValueError(
                "Missing references. Didn't find any files with required extensions (%s) at path %s" %
                (BWA_INDEX_EXT, ref_dir))
        if len(roots) != 1:
            raise ValueError("multiple references? Found reference roots %s" % (roots,))
        return os.path.join(ref_dir, roots.pop())

    def __init__(self, ctx):
        """
        Read the configuration, then build the aligner and the chain of
        hit processors it feeds.

        @param ctx: task context.
        """
        # Name the class explicitly: super(type(self), self) recurses
        # forever as soon as this class is subclassed.
        super(mapper, self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(
            self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        self.aligner = BwaAligner()
        self.aligner.event_monitor = self.event_monitor
        self.aligner.qformat = self.format
        self.aligner.max_isize = self.max_isize
        self.aligner.nthreads = self.nthreads
        self.aligner.trim_qual = self.trim_qual
        self.aligner.mmap_enabled = True

        ######## assemble hit processor chain
        chain = FilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(EmitSamLink(ctx, self.event_monitor))
        else:
            chain.set_next(MarkDuplicatesEmitter(ctx, self.event_monitor))
        self.aligner.hit_visitor = chain

        ######## set the path to the reference index
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        self.aligner.reference = self.get_reference_root(self.ref_archive)

        # part of the code is a workaround for accumulating records,
        # see #331
        isplit = InputSplit(ctx.getInputSplit())
        self.split_end = isplit.offset + isplit.length

    def map(self, ctx):
        """
        Load one input record into the aligner's batch and run the
        alignment when the batch is full or the input is finished.

        @param ctx: task context providing the input key and value.
        """
        # Accumulates reads in the aligner's batch, until batch size is
        # reached or until the input is finished.  At that point it calls
        # run_alignment, which emits the output through the hit visitor.
        k = struct.unpack(">q", ctx.getInputKey())[0]
        v = ctx.getInputValue()
        self.aligner.load_pair_record(v.split("\t"))
        is_last_record = self.__is_last_record(k, v)
        if self.aligner.get_batch_size() >= self.batch_size or is_last_record:
            self.aligner.run_alignment()
            self.aligner.clear_batch()
        if is_last_record:
            self.aligner.release_resources()
def setUp(self):
    """
    Create an aligner that reads the mini reference fixture, collects its
    hits in a MappingsCollector and uses the "fastq-sanger" quality format.
    """
    aligner = self.aligner = BwaAligner()

    # the fixture lives under the directory three levels above this file
    here = os.path.dirname(__file__)
    test_dir = os.path.abspath(os.path.join(here, '..', '..', '..'))
    aligner.reference = os.path.join(
        test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
    aligner.hit_visitor = MappingsCollector()
    aligner.qformat = "fastq-sanger"
class TestBwaAligner(unittest.TestCase):
    """
    Alignment tests that run single read pairs against the mini reference
    fixture and check the collected SAM records.
    """

    def setUp(self):
        """Create an aligner configured for the mini reference fixture."""
        aligner = self.aligner = BwaAligner()

        # the fixture lives under the directory three levels above this file
        here = os.path.dirname(__file__)
        test_dir = os.path.abspath(os.path.join(here, '..', '..', '..'))
        aligner.reference = os.path.join(
            test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
        aligner.hit_visitor = MappingsCollector()
        aligner.qformat = "fastq-sanger"

    def test_pair(self):
        """Check the complete records produced for one 101-base pair."""
        read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
        read_seq = "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG"
        read_qual = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################"
        mate_seq = "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT"
        mate_qual = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
        mappings = self._align_pair(
            (read_id, read_seq, read_qual, mate_seq, mate_qual))

        # NOTE(review): the fields of the expected records are separated by
        # single spaces here, while _get_sam_fields splits records on tabs;
        # confirm the separator against the collector's actual output.
        sep = " "
        expected_mate = sep.join([
            read_id, "133 chr1 24762 0 * = 24762 0", mate_seq, mate_qual])
        expected_read = sep.join([
            read_id, "73 chr1 24762 37 101M = 24762 0", read_seq, read_qual,
            "XT:A:U NM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:7T83G9"])
        self.assertEqual(expected_mate, mappings[0])
        self.assertEqual(expected_read, mappings[1])

    def test_easy_align(self):
        """Check the first 11 SAM fields for a pair of 60-base reads."""
        read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
        # pos: 361
        read_seq = "TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC"
        read_qual = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAE"
        # pos: 541
        mate_seq = "AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC"
        mate_qual = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAF"
        mappings = self._align_pair(
            (read_id, read_seq, read_qual, mate_seq, mate_qual))

        expected_mate = [
            read_id, "129", "chr1", "541", "37", "60M", "=", "361", "-180",
            mate_seq, mate_qual]
        expected_read = [
            read_id, "65", "chr1", "361", "37", "60M", "=", "541", "180",
            read_seq, read_qual]
        self.assertEqual(expected_mate, self._get_sam_fields(mappings[0]))
        self.assertEqual(expected_read, self._get_sam_fields(mappings[1]))

    def test_align_pair_of_rev_complements(self):
        """Check the first 11 SAM fields for reverse-complemented reads."""
        # These are the same reads as in test_easy_align, but reversed
        # and complemented.  Remember that those were taken
        # directly from the mini reference sequence.
        read_id = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
        # pos: 361
        read_seq = "GTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGGTTAGGGTTA"
        read_qual = "EAEHEEEEAGIIGHFEGC?B?HGIGEGCHGEGEEGFAAGEFE?BF@HFCFDAA=<?B@@?"
        # pos: 541
        mate_seq = "GGCGGAGTTGCGTTCTCCTCAGCACAGACCCGGAGAGCACCGCGAGGGCGGAGCTGCGTT"
        mate_qual = "FAF@ECFCGFGIIGGCGECIHEBC:B?HEFDHGEIHI<BEEFIHEHE<DDHBDDDFD@@@"
        mappings = self._align_pair(
            (read_id, read_seq, read_qual, mate_seq, mate_qual))

        expected_read = [
            read_id, '113', 'chr1', '361', '37', '60M', '=', '541', '180',
            'TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC',
            '?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAE']
        expected_mate = [
            read_id, '177', 'chr1', '541', '37', '60M', '=', '361', '-180',
            'AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC',
            '@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAF']
        self.assertEqual(expected_read, self._get_sam_fields(mappings[0]))
        self.assertEqual(expected_mate, self._get_sam_fields(mappings[1]))

    def _align_pair(self, pair):
        """Align one pair and return the collected mappings, sorted."""
        aligner = self.aligner
        aligner.load_pair_record(pair)
        aligner.run_alignment()
        aligner.clear_batch()
        return sorted(aligner.hit_visitor.mappings)

    @staticmethod
    def _get_sam_fields(sam_record):
        """Return the first 11 tab-separated fields of a SAM record."""
        fields = sam_record.split('\t')
        return fields[:11]
class TestBwaAligner(unittest.TestCase):
    """
    Tests for BwaAligner batch handling, defaults and alignment runs, using
    the reference and the pairs fixture provided by the utils module.
    """
    REFERENCE = os.path.join(os.path.dirname(__file__), 'fixtures/foobar.fa')
    BUILD_REFERENCE = True

    class SimpleVisitor(object):
        """Hit visitor that writes every hit, SAM-formatted, to a buffer."""

        def __init__(self):
            self.sam = SamFormatter()
            self.output = StringIO.StringIO()

        def process(self, pair):
            # one formatted line per hit in the pair
            for hit in pair:
                print >>self.output, self.sam.format(hit)

    def setUp(self):
        """Build the reference index, the aligner and the list of pairs."""
        utils.build_ref_index()
        aligner = self.aligner = BwaAligner()
        aligner.reference = utils.reference
        aligner.hit_visitor = type(self).SimpleVisitor()
        self.pairs = []
        with open(utils.get_fixture_path("pairs.txt")) as fixture:
            # lines starting with '#' are left for comments
            self.pairs.extend(
                row.rstrip("\r\n").split("\t")
                for row in fixture
                if not row.startswith("#"))

    def tearDown(self):
        utils.remove_ref_index()

    def _load_all_pairs(self):
        """Feed every fixture row to the aligner's batch."""
        for row in self.pairs:
            self.aligner.load_pair_record(row)

    def _collected_output(self):
        """Return the text accumulated by the hit visitor."""
        return self.aligner.hit_visitor.output.getvalue()

    def test_load_clear_batch(self):
        self._load_all_pairs()
        self.assertEqual(len(self.pairs), self.aligner.get_batch_size())
        self.aligner.clear_batch()
        self.assertEqual(0, self.aligner.get_batch_size())

    def test_defaults(self):
        aligner = self.aligner
        self.assertEqual("fastq-illumina", aligner.qformat)
        self.assertEqual(1000, aligner.max_isize)
        self.assertEqual(1, aligner.nthreads)
        self.assertEqual(0, aligner.trim_qual)

    def test_alignment(self):
        self._load_all_pairs()
        self.aligner.run_alignment()
        # TODO: write a more useful test, but for that we'll need a
        # complete test fixture
        self.assertTrue(len(self._collected_output()) > 0)

    def test_alignment_mmap(self):
        self.aligner.mmap_enabled = True
        # Generate the .sax and .rsax indices.  They will be removed by
        # tearDown.
        bwa.make_suffix_arrays_for_mmap(utils.reference)
        self._load_all_pairs()
        self.aligner.run_alignment()
        # TODO: write a more useful test, but for that we'll need a
        # complete test fixture
        self.assertTrue(len(self._collected_output()) > 0)

    def test_missing_mmap_index(self):
        # mmap is enabled but the suffix arrays are not generated here
        self.aligner.mmap_enabled = True
        self._load_all_pairs()
        self.assertRaises(ValueError, self.aligner.run_alignment)
class TestBwaAligner(unittest.TestCase):
    """
    Tests for BwaAligner batch handling, defaults and alignment runs, using
    the reference and the pairs fixture provided by the utils module.
    """
    REFERENCE = os.path.join(os.path.dirname(__file__), 'fixtures/foobar.fa')
    BUILD_REFERENCE = True

    class SimpleVisitor(object):
        """Hit visitor that writes every hit, SAM-formatted, to a buffer."""

        def __init__(self):
            self.sam = SamFormatter()
            self.output = StringIO.StringIO()

        def process(self, pair):
            # one formatted line per hit in the pair
            for hit in pair:
                print >> self.output, self.sam.format(hit)

    def setUp(self):
        """Build the reference index, the aligner and the list of pairs."""
        utils.build_ref_index()
        aligner = self.aligner = BwaAligner()
        aligner.reference = utils.reference
        aligner.hit_visitor = type(self).SimpleVisitor()
        self.pairs = []
        with open(utils.get_fixture_path("pairs.txt")) as fixture:
            # lines starting with '#' are left for comments
            self.pairs.extend(
                row.rstrip("\r\n").split("\t")
                for row in fixture
                if not row.startswith("#"))

    def tearDown(self):
        utils.remove_ref_index()

    def _load_all_pairs(self):
        """Feed every fixture row to the aligner's batch."""
        for row in self.pairs:
            self.aligner.load_pair_record(row)

    def _collected_output(self):
        """Return the text accumulated by the hit visitor."""
        return self.aligner.hit_visitor.output.getvalue()

    def test_load_clear_batch(self):
        self._load_all_pairs()
        self.assertEqual(len(self.pairs), self.aligner.get_batch_size())
        self.aligner.clear_batch()
        self.assertEqual(0, self.aligner.get_batch_size())

    def test_defaults(self):
        aligner = self.aligner
        self.assertEqual("fastq-illumina", aligner.qformat)
        self.assertEqual(1000, aligner.max_isize)
        self.assertEqual(1, aligner.nthreads)
        self.assertEqual(0, aligner.trim_qual)

    def test_alignment(self):
        self._load_all_pairs()
        self.aligner.run_alignment()
        # TODO: write a more useful test, but for that we'll need a
        # complete test fixture
        self.assertTrue(len(self._collected_output()) > 0)

    def test_alignment_mmap(self):
        self.aligner.mmap_enabled = True
        # Generate the .sax and .rsax indices.  They will be removed by
        # tearDown.
        bwa.make_suffix_arrays_for_mmap(utils.reference)
        self._load_all_pairs()
        self.aligner.run_alignment()
        # TODO: write a more useful test, but for that we'll need a
        # complete test fixture
        self.assertTrue(len(self._collected_output()) > 0)

    def test_missing_mmap_index(self):
        # mmap is enabled but the suffix arrays are not generated here
        self.aligner.mmap_enabled = True
        self._load_all_pairs()
        self.assertRaises(ValueError, self.aligner.run_alignment)