def test_normalize_diacritics_file(self):
    """Check normalize_diacritics_file against the NFC reference fixture.

    The call must return a truthy value, the written file must differ
    from the NFD input, and it must be identical to the reference file.
    """
    source = datapath('nfd.txt')
    expected = datapath('nfc.txt')
    produced = datapath('processed_nfc.txt')

    succeeded = preprocessing.normalize_diacritics_file(source, produced)
    self.assertTrue(succeeded)

    # The input and the processed output must not be the same content.
    output_equals_input = filecmp.cmp(source, produced)
    self.assertFalse(output_equals_input)

    # The processed output must match the reference file.
    output_equals_reference = filecmp.cmp(expected, produced)
    self.assertTrue(output_equals_reference)
def test_strip_accents_file(self):
    """Check strip_accents_file against the reference fixture.

    The call must return a truthy value, the written file must differ
    from the source file, and it must be identical to the reference file.
    """
    source = datapath('src_file.txt')
    expected = datapath('ref_proccessed_file.txt')
    produced = datapath('processed_file.txt')

    succeeded = preprocessing.strip_accents_file(source, produced)
    self.assertTrue(succeeded)

    # Processing must actually change the content ...
    output_equals_input = filecmp.cmp(source, produced)
    self.assertFalse(output_equals_input)

    # ... and the result must match the reference file.
    output_equals_reference = filecmp.cmp(expected, produced)
    self.assertTrue(output_equals_reference)
def test_second_too_short():
    """The CLI must exit when the second input has one read fewer than the first."""
    with temporary_path("truncated.2.fastq") as trunc2:
        # Build the truncated copy: everything except the final four lines,
        # i.e. the last read is missing.
        with open(datapath('paired.2.fastq')) as source:
            kept_lines = source.readlines()[:-4]
        with open(trunc2, 'w') as target:
            target.writelines(kept_lines)

        with raises(SystemExit):
            with redirect_stderr():
                execute_cli(
                    '-a XX --paired-output out.fastq'.split()
                    + [datapath('paired.1.fastq'), trunc2]
                )
def test_load_corpus_from_directory(self):
    """Exercise the directory loader on a flat, an invalid and a nested path.

    * 'dirdata': the corpus must hold one entry per item matched by ``*``.
    * 'test_data': loading must raise NotADirectoryError.
    * the data root (``datapath()``): the corpus must hold one entry fewer
      than the number of paths matched recursively by ``**/*``.
    """
    flat_dir = datapath('dirdata')
    bad_dir = datapath('test_data')
    root_dir = datapath()

    flat_corpus = self.directory_loader(path=flat_dir)
    self.assertEqual(
        len(flat_corpus.data),
        len(list(Path(flat_dir).glob('*'))),
    )

    with self.assertRaises(NotADirectoryError):
        self.directory_loader(path=bad_dir)

    nested_corpus = self.directory_loader(path=root_dir)
    recursive_matches = list(Path(root_dir).glob('**/*'))
    # NOTE(review): the "- 1" presumably discounts a directory entry that
    # the recursive glob returns but the loader does not load -- confirm.
    self.assertEqual(len(nested_corpus.data), len(recursive_matches) - 1)
def test_explicit_format_with_paired():
    """Run paired trimming with --format=fastq on inputs named ``*.txt``.

    The FASTQ fixtures are copied to temporary ``.txt`` files so that the
    format has to come from the explicit option rather than the extension.
    """
    with temporary_path("paired.1.txt") as txt1, \
            temporary_path("paired.2.txt") as txt2:
        copies = (
            ("paired.1.fastq", txt1),
            ("paired.2.fastq", txt2),
        )
        for fixture_name, destination in copies:
            shutil.copyfile(datapath(fixture_name), destination)

        run_paired(
            '--format=fastq -a TTAGACATAT -m 14',
            in1=txt1,
            in2=txt2,
            expected1='paired.m14.1.fastq',
            expected2='paired.m14.2.fastq',
        )
def test_no_trimming_legacy():
    """Run the CLI with an adapter ('XXXXX') and discard both outputs.

    Guards against a division by zero in this situation.
    """
    first_input = datapath('paired.1.fastq')
    second_input = datapath('paired.2.fastq')

    argv = ['-a', 'XXXXX']
    argv += ['-o', '/dev/null', '-p', '/dev/null']
    argv += ['-pe1', first_input, '-pe2', second_input]

    execute_cli(argv)
def test_unmatched_read_names():
    """The 'trim' command must return a non-zero code for mismatched pairs.

    A copy of the first input is written with its second and third reads
    (four lines each) swapped, then paired with the unmodified second
    input. The command result must be a 2-tuple whose first element is
    non-zero.
    """
    with temporary_path("swapped.1.fastq") as swapped:
        try:
            # Create a file in which reads 2 and 3 are swapped
            with open(datapath('paired.1.fastq')) as f:
                lines = f.readlines()
            lines = lines[0:4] + lines[8:12] + lines[4:8] + lines[12:]
            with open(swapped, 'w') as f:
                f.writelines(lines)

            with redirect_stderr():
                command = get_command('trim')
                result = command.execute(
                    '-a XX -o out1.fastq -p out2.fastq'.split()
                    + ['-pe1', swapped, '-pe2', datapath('paired.2.fastq')])
                assert isinstance(result, tuple)
                assert len(result) == 2
                assert result[0] != 0
        finally:
            # Clean up each output independently. A missing file must not
            # raise here: that would hide the real failure from the try
            # body and leave the other output behind.
            for leftover in ('out1.fastq', 'out2.fastq'):
                try:
                    os.remove(leftover)
                except FileNotFoundError:
                    pass
def run_paired(
    params,
    in1,
    in2,
    expected1,
    expected2,
    aligners=('adapter', ),
    callback=None,
    assert_files_equal=True,
    error_on_rc=True,
):
    """Run the 'trim' command on a pair of inputs once per aligner.

    Args:
        params: Command-line options, either as one string (split on
            whitespace) or as a sequence of argument strings. The
            sequence is copied, never modified.
        in1, in2: Input names; each is formatted with ``aligner=...`` and
            resolved through ``datapath``.
        expected1, expected2: Expected-output names; each is formatted
            with ``aligner=...``, resolved through ``cutpath`` for the
            comparison, and used (prefixed with ``tmp1-``/``tmp2-``) to
            name the temporary output files.
        aligners: Aligner names to iterate over.
        callback: Optional callable invoked per aligner as
            ``callback(aligner, infiles, (out1, out2), result)``.
        assert_files_equal: When true, assert both outputs equal the
            expected files.
        error_on_rc: When true, raise AssertionError if the first element
            of the command result is non-zero.

    Raises:
        AssertionError: On an unexpected result shape, a non-zero return
            code (with ``error_on_rc``) or an output mismatch (with
            ``assert_files_equal``).
    """
    if isinstance(params, str):
        params = params.split()

    for aligner in aligners:
        expected_name1 = expected1.format(aligner=aligner)
        expected_name2 = expected2.format(aligner=aligner)
        with temporary_path('tmp1-' + expected_name1) as p1, \
                temporary_path('tmp2-' + expected_name2) as p2:
            infiles = [
                datapath(i.format(aligner=aligner)) for i in (in1, in2)
            ]

            # Fresh argument list per aligner so the caller's sequence is
            # never mutated and tuples are accepted as well.
            p = list(params)
            p += ['--aligner', aligner, '-o', p1, '-p', p2]
            for infile_args in zip(('-pe1', '-pe2'), infiles):
                p.extend(infile_args)

            command = get_command('trim')
            result = command.execute(p)
            assert isinstance(result, tuple)
            assert len(result) == 2

            if error_on_rc:
                summary = result[1]
                err = (
                    summary['exception']
                    if summary and 'exception' in summary
                    else None
                )
                if result[0] != 0:
                    failure = AssertionError(
                        "Return code {} != 0".format(result[0]))
                    if err is None:
                        raise failure
                    # NOTE(review): err['details'][1] is presumably the
                    # original exception instance -- confirm against the
                    # command's result format.
                    raise failure from err['details'][1]

            if assert_files_equal:
                assert files_equal(cutpath(expected_name1), p1)
                assert files_equal(cutpath(expected_name2), p2)

            if callback:
                callback(aligner, infiles, (p1, p2), result)
def run_interleaved(params, inpath, expected, aligners=('adapter', )):
    """Run the 'trim' command on an interleaved input once per aligner.

    Args:
        params: Command-line options, either as one string (split on
            whitespace) or as a sequence of argument strings. The
            sequence is copied, never modified.
        inpath: Input name; formatted with ``aligner=...`` and resolved
            through ``datapath``, then passed via ``-l``.
        expected: Expected-output name; formatted with ``aligner=...``.
            It names the temporary output file (passed via ``-L``) and,
            resolved through ``cutpath``, the file to compare against.
        aligners: Aligner names to iterate over.

    Raises:
        AssertionError: If the command result is not a 2-tuple or the
            output differs from the expected file.
    """
    if isinstance(params, str):
        params = params.split()

    for aligner in aligners:
        expected_name = expected.format(aligner=aligner)
        with temporary_path(expected_name) as tmp:
            # Fresh argument list per aligner so the caller's sequence is
            # never mutated and tuples are accepted as well.
            p = list(params)
            p += [
                '--aligner', aligner,
                '-l', datapath(inpath.format(aligner=aligner)),
                '-L', tmp,
            ]

            command = get_command('trim')
            result = command.execute(p)
            assert isinstance(result, tuple)
            assert len(result) == 2
            assert files_equal(cutpath(expected_name), tmp)
def test_file_info(self):
    """Smoke test: utils.file_info must run on the 'nfc.txt' fixture.

    Nothing is asserted about the result; the test only fails if the
    call raises.
    """
    utils.file_info(datapath('nfc.txt'))
def test_is_file_nfc(self):
    """utils.is_file_nfc must accept 'nfc.txt' and reject 'nfc_fail.txt'."""
    passing_file, failing_file = datapath('nfc.txt'), datapath('nfc_fail.txt')

    accepted = utils.is_file_nfc(passing_file)
    self.assertTrue(accepted)

    rejected = utils.is_file_nfc(failing_file)
    self.assertFalse(rejected)
def test_load_corpus_from_text(self):
    """A corpus built from the text of the 'owe_pass' fixture has length 420."""
    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(datapath('owe_pass')) as handle:
        text = handle.read()

    corpus = self.corpus_class(text=text)
    self.assertEqual(len(corpus), 420)
def test_load_corpus_from_path_stream(self):
    """A corpus loaded from the 'owe_pass' path with stream=True has length 420."""
    streamed_corpus = self.corpus_class(
        path=datapath('owe_pass'),
        fformat=self.txt_extension,
        stream=True,
    )
    expected_length = 420
    self.assertEqual(len(streamed_corpus), expected_length)
def test_missing_file():
    """The CLI must exit when --paired-output is given with only one input file."""
    with raises(SystemExit):
        with redirect_stderr():
            execute_cli([
                '-a', 'XX',
                '--paired-output', 'out.fastq',
                datapath('paired.1.fastq'),
            ])
def test_is_valid_owe_format(self):
    """The format validator must return a falsy value for the 'nfc.txt' fixture."""
    verdict = preprocessing.is_valid_owé_format(datapath('nfc.txt'))
    self.assertFalse(verdict)