def test_job_conf_getters(self):
    """Check the typed ``get_int`` / ``get_float`` / ``get_bool`` accessors.

    Each key in the configuration is named after the getter used to read it,
    so the accessor is looked up dynamically from the key.  The test also
    expects ``get_bool`` to return the supplied default when the key is
    absent or holds the value ``'foo'``, and to raise ``RuntimeError`` for
    the value ``'foo'`` when no default is given.
    """
    values = ['int', '1', 'float', '2.3', 'bool', 'false']
    conf = JobConf(values)
    # Keys sit at the even positions of the flat key/value list.
    for key, expected in zip(values[::2], [1, 2.3, False]):
        typed_get = getattr(conf, 'get_%s' % key)
        self.assertEqual(typed_get(key), expected)

    candidates = (JobConf([]), JobConf(['x', 'foo']))
    for conf in candidates:
        for fallback in (False, True):
            self.assertEqual(conf.get_bool('x', default=fallback), fallback)

    unparsable = JobConf(['x', 'foo'])
    self.assertRaises(RuntimeError, unparsable.get_bool, 'x')
def setUp(self):
    """Prepare the job configuration, scratch directory, context and mapper.

    A temporary working directory is created and entered, and the reference
    archive named by ``mapred.cache.archives`` is unpacked into it by
    ``self._setup_ref``.  If anything fails after the scratch directory has
    been created, ``tearDown`` is invoked explicitly (unittest does not call
    it when ``setUp`` raises) and the original exception is re-raised.
    """
    self._map_batch_size = 6
    self._log = logging.getLogger(__name__)
    self._old_cwd = os.getcwd()
    self._jc = JobConf([
        'seal.seqal.log.level', 'DEBUG',
        'seal.seqal.fastq-subformat', 'fastq-sanger',
        #'seal.seqal.alignment.max.isize', None,
        #'seal.seqal.alignment.min.isize', None,
        'seal.seqal.pairing.batch.size', self._map_batch_size,
        'seal.seqal.min_hit_quality', 0,
        'seal.seqal.remove_unmapped', False,
        'seal.seqal.nthreads', 1,
        'seal.seqal.trim.qual', 0,
        'mapred.reduce.tasks', 0,
        'mapred.cache.archives',
        (os.path.join("file://", tseal_utils.MiniRefMemDir,
                      "mini_ref_bwamem_0.7.8.tar") + "#reference"),
    ])
    self._things_to_clean_up = []
    workdir = tempfile.mkdtemp("seqal_mapper_test_workdir")
    self._things_to_clean_up.append(workdir)
    # Everything from here on runs with the scratch directory already
    # registered for cleanup, so it all lives inside the try: a failure in
    # chdir or in building the context must not leak the directory or leave
    # the process in the wrong cwd.
    try:
        os.chdir(workdir)
        self._ctx = sam_map_context(self._jc, [])
        self._setup_ref(self._jc.get('mapred.cache.archives'))
        self._mapper = mapper(self._ctx)
    except Exception:
        # call tearDown ourselves because unittest doesn't call it if
        # setUp fails
        self.tearDown()
        raise
def test_job_conf(self):
    """Check that ``JobConf`` resolves the names listed in ``mrv2_to_mrv1``.

    The configuration is built from the keys of ``mrv1_to_mrv2``, each mapped
    to itself.  Every key of ``mrv2_to_mrv1`` must then be readable from the
    ``JobConf`` and yield the value stored under its mapped counterpart.
    """
    job_conf = {k: k for k in mrv1_to_mrv2}
    # JobConf takes a flat [key, value, key, value, ...] sequence.
    # dict.items() (rather than the Python-2-only iteritems()) keeps this
    # working on both Python 2 and Python 3.
    flat = [item for pair in job_conf.items() for item in pair]
    jc = JobConf(flat)
    for k in mrv2_to_mrv1:
        self.assertEqual(jc[k], job_conf[mrv2_to_mrv1[k]])
class TestSeqalMapper(unittest.TestCase):
    """Run the seqal mapper on a small batch of reads and check its SAM output.

    Each test executes inside a temporary working directory into which the
    reference archive named by ``mapred.cache.archives`` is extracted.
    """

    def setUp(self):
        """Prepare the job configuration, scratch directory, context and mapper.

        If anything fails after the scratch directory has been created,
        ``tearDown`` is invoked explicitly (unittest does not call it when
        ``setUp`` raises) and the original exception is re-raised.
        """
        self._map_batch_size = 6
        self._log = logging.getLogger(__name__)
        self._old_cwd = os.getcwd()
        self._jc = JobConf([
            'seal.seqal.log.level', 'DEBUG',
            'seal.seqal.fastq-subformat', 'fastq-sanger',
            #'seal.seqal.alignment.max.isize', None,
            #'seal.seqal.alignment.min.isize', None,
            'seal.seqal.pairing.batch.size', self._map_batch_size,
            'seal.seqal.min_hit_quality', 0,
            'seal.seqal.remove_unmapped', False,
            'seal.seqal.nthreads', 1,
            'seal.seqal.trim.qual', 0,
            'mapred.reduce.tasks', 0,
            'mapred.cache.archives',
            (os.path.join("file://", tseal_utils.MiniRefMemDir,
                          "mini_ref_bwamem_0.7.8.tar") + "#reference"),
        ])
        self._things_to_clean_up = []
        workdir = tempfile.mkdtemp("seqal_mapper_test_workdir")
        self._things_to_clean_up.append(workdir)
        # chdir and context creation are inside the try so that a failure
        # there still removes the scratch directory and restores the cwd.
        try:
            os.chdir(workdir)
            self._ctx = sam_map_context(self._jc, [])
            self._setup_ref(self._jc.get('mapred.cache.archives'))
            self._mapper = mapper(self._ctx)
        except Exception:
            # call tearDown ourselves because unittest doesn't call it if
            # setUp fails
            self.tearDown()
            raise

    def tearDown(self):
        """Remove everything registered for cleanup and restore the cwd.

        Removal is best-effort: a failure is logged and the remaining items
        are still processed.  The original working directory is restored
        even if the cleanup loop itself raises.
        """
        try:
            for item in self._things_to_clean_up:
                try:
                    shutil.rmtree(item)
                except Exception as e:
                    self._log.info("Failed to remove %s", item)
                    self._log.info("Error: %s", e)
        finally:
            os.chdir(self._old_cwd)

    def _setup_ref(self, mr_cache_archives):
        """Extract the reference archive into the cwd and create its link.

        ``mr_cache_archives`` has the form ``<archive path>#<link name>``.
        The archive is unpacked into the current directory and ``<link name>``
        is created as a symlink pointing at ``.``.
        """
        self._log.info("Setting up reference using property value '%s'",
                       mr_cache_archives)
        archive, link = mr_cache_archives.split('#')
        # Close the archive as soon as extraction is done instead of leaving
        # the file handle open until garbage collection.
        with tarfile.TarFile(archive) as ar:
            ar.extractall('.')
        os.symlink('.', link)
        self._log.info("Here is the listing of the extraction directory: %s",
                       ', '.join(os.listdir('.')))

    def test_simple_map(self):
        """Map exactly one batch of reads and compare against expected SAM."""
        # get input data and expected output.  We keep exactly one map batch
        # of reads (as per self._map_batch_size).  Each fragment contributes
        # two reads, hence the floor division by 2 (which must stay an int to
        # be usable as a slice bound).
        reads = tseal_utils.get_mini_ref_seqs()[0:(self._map_batch_size // 2)]
        expected_output = sorted(
            tseal_utils.rapi_mini_ref_seqs_sam_no_header().split('\n')[
                0:2 * len(reads)])
        self._log.info("loaded %s fragments and %s lines of expected output",
                       len(reads), len(expected_output))
        if len(reads) * 2 < self._mapper.batch_size:
            self.fail(
                "batch size for test (%s) is set larger than the number of available "
                "reads (%s). Aligner won't run" %
                (self._mapper.batch_size, len(reads) * 2))

        for idx, fragment in enumerate(reads):
            self._ctx.set_input_key(idx * 100)
            self._ctx.set_input_value('\t'.join(fragment))
            self._mapper.map(self._ctx)

        # Output order is not part of the contract, so compare sorted lines.
        produced_sam = sorted(self._ctx.sam_lines)
        self.assertEqual(len(expected_output), len(produced_sam))
        self.assertEqual(expected_output, produced_sam)
        self.assertEqual(
            len(reads) * 2,
            self._ctx.counters["SEQAL:EMITTED SAM RECORDS"])
def test_missing_key(self):
    """``JobConf.get`` on an absent key must raise ``RuntimeError``."""
    conf = JobConf(((1, 2), (3, 4)))
    with self.assertRaises(RuntimeError):
        conf.get('no_key')
def test_missing_key(self):
    """``JobConf.get`` on an absent key must raise ``RuntimeError``."""
    conf = JobConf(('a', 'b', 'c', 'd'))
    with self.assertRaises(RuntimeError):
        conf.get('no_key')
class TestSeqalMapper(unittest.TestCase):
    """Run the seqal mapper on a small batch of reads and check its SAM output.

    Each test executes inside a temporary working directory into which the
    reference archive named by ``mapred.cache.archives`` is extracted.
    """

    def setUp(self):
        """Prepare the job configuration, scratch directory, context and mapper.

        If anything fails after the scratch directory has been created,
        ``tearDown`` is invoked explicitly (unittest does not call it when
        ``setUp`` raises) and the original exception is re-raised.
        """
        self._map_batch_size = 6
        self._log = logging.getLogger(__name__)
        self._old_cwd = os.getcwd()
        self._jc = JobConf([
            'seal.seqal.log.level', 'DEBUG',
            'seal.seqal.fastq-subformat', 'fastq-sanger',
            #'seal.seqal.alignment.max.isize', None,
            #'seal.seqal.alignment.min.isize', None,
            'seal.seqal.pairing.batch.size', self._map_batch_size,
            'seal.seqal.min_hit_quality', 0,
            'seal.seqal.remove_unmapped', False,
            'seal.seqal.nthreads', 1,
            'seal.seqal.trim.qual', 0,
            'mapred.reduce.tasks', 0,
            'mapred.cache.archives',
            (os.path.join("file://", tseal_utils.MiniRefMemDir,
                          "mini_ref_bwamem_0.7.8.tar") + "#reference"),
        ])
        self._things_to_clean_up = []
        workdir = tempfile.mkdtemp("seqal_mapper_test_workdir")
        self._things_to_clean_up.append(workdir)
        # chdir and context creation are inside the try so that a failure
        # there still removes the scratch directory and restores the cwd.
        try:
            os.chdir(workdir)
            self._ctx = sam_map_context(self._jc, [])
            self._setup_ref(self._jc.get('mapred.cache.archives'))
            self._mapper = mapper(self._ctx)
        except Exception:
            # call tearDown ourselves because unittest doesn't call it if
            # setUp fails
            self.tearDown()
            raise

    def tearDown(self):
        """Remove everything registered for cleanup and restore the cwd.

        Removal is best-effort: a failure is logged and the remaining items
        are still processed.  The original working directory is restored
        even if the cleanup loop itself raises.
        """
        try:
            for item in self._things_to_clean_up:
                try:
                    shutil.rmtree(item)
                except Exception as e:
                    self._log.info("Failed to remove %s", item)
                    self._log.info("Error: %s", e)
        finally:
            os.chdir(self._old_cwd)

    def _setup_ref(self, mr_cache_archives):
        """Extract the reference archive into the cwd and create its link.

        ``mr_cache_archives`` has the form ``<archive path>#<link name>``.
        The archive is unpacked into the current directory and ``<link name>``
        is created as a symlink pointing at ``.``.
        """
        self._log.info("Setting up reference using property value '%s'",
                       mr_cache_archives)
        archive, link = mr_cache_archives.split('#')
        # Close the archive as soon as extraction is done instead of leaving
        # the file handle open until garbage collection.
        with tarfile.TarFile(archive) as ar:
            ar.extractall('.')
        os.symlink('.', link)
        self._log.info("Here is the listing of the extraction directory: %s",
                       ', '.join(os.listdir('.')))

    def test_simple_map(self):
        """Map exactly one batch of reads and compare against expected SAM."""
        # get input data and expected output.  We keep exactly one map batch
        # of reads (as per self._map_batch_size).  Each fragment contributes
        # two reads, hence the floor division by 2 (which must stay an int to
        # be usable as a slice bound).
        reads = tseal_utils.get_mini_ref_seqs()[0:(self._map_batch_size // 2)]
        expected_output = sorted(
            tseal_utils.rapi_mini_ref_seqs_sam_no_header().split(
                '\n')[0:2 * len(reads)])
        self._log.info("loaded %s fragments and %s lines of expected output",
                       len(reads), len(expected_output))
        if len(reads) * 2 < self._mapper.batch_size:
            self.fail(
                "batch size for test (%s) is set larger than the number of available "
                "reads (%s). Aligner won't run" %
                (self._mapper.batch_size, len(reads) * 2))

        for idx, fragment in enumerate(reads):
            self._ctx.set_input_key(idx * 100)
            self._ctx.set_input_value('\t'.join(fragment))
            self._mapper.map(self._ctx)

        # Output order is not part of the contract, so compare sorted lines.
        produced_sam = sorted(self._ctx.sam_lines)
        self.assertEqual(len(expected_output), len(produced_sam))
        self.assertEqual(expected_output, produced_sam)
        self.assertEqual(
            len(reads) * 2,
            self._ctx.counters["SEQAL:EMITTED SAM RECORDS"])
def setUp(self):
    """Create an empty job configuration, a reduce context and two reducers.

    ``self.__reducer`` has ``discard_duplicates`` switched on, while
    ``self.__clean_reducer`` is left exactly as constructed.  Both share the
    same context.
    """
    conf = JobConf([])
    ctx = reduce_context(conf, [])
    self.__jc = conf
    self.__ctx = ctx

    dedup_reducer = reducer(ctx)
    dedup_reducer.discard_duplicates = True
    self.__reducer = dedup_reducer

    self.__clean_reducer = reducer(ctx)  # unmodified