Esempio n. 1
0
    def test_normalize_diacritics_file(self):
        """Normalize ``nfd.txt`` into ``processed_nfc.txt`` and check the result.

        The call must return a truthy value, the written file must differ
        from the source file, and it must match the ``nfc.txt`` reference.
        """
        nfd_filepath = datapath('nfd.txt')
        reference_nfc_filepath = datapath('nfc.txt')
        processed_nfc_filepath = datapath('processed_nfc.txt')

        self.assertTrue(preprocessing.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath))
        # shallow=False forces a content comparison. With the default
        # shallow=True, files whose stat signatures (type, size, mtime) match
        # are reported equal without their bytes ever being read.
        self.assertFalse(
            filecmp.cmp(nfd_filepath, processed_nfc_filepath, shallow=False))  # src & processed are different
        self.assertTrue(
            filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath, shallow=False))  # processed matches reference
Esempio n. 2
0
    def test_strip_accents_file(self):
        """Strip accents from ``src_file.txt`` and compare with the reference.

        The call must return a truthy value, the written file must differ
        from the source file, and it must match ``ref_proccessed_file.txt``.
        """
        src_filepath = datapath('src_file.txt')
        reference_stripped_filepath = datapath('ref_proccessed_file.txt')
        processed_stripped_filepath = datapath('processed_file.txt')

        self.assertTrue(preprocessing.strip_accents_file(src_filepath, processed_stripped_filepath))
        # shallow=False forces a content comparison. With the default
        # shallow=True, files whose stat signatures (type, size, mtime) match
        # are reported equal without their bytes ever being read.
        self.assertFalse(filecmp.cmp(src_filepath, processed_stripped_filepath, shallow=False))
        self.assertTrue(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath, shallow=False))
Esempio n. 3
0
def test_second_too_short():
    """Expect SystemExit when the second input file lacks its last four lines."""
    with temporary_path("truncated.2.fastq") as trunc2:
        # Copy paired.2.fastq without its final four lines, i.e. a truncated
        # file in which the last read is missing.
        with open(datapath('paired.2.fastq')) as source:
            kept_lines = source.readlines()[:-4]
        with open(trunc2, 'w') as target:
            target.writelines(kept_lines)

        with raises(SystemExit):
            with redirect_stderr():
                fixed_args = '-a XX --paired-output out.fastq'.split()
                input_files = [datapath('paired.1.fastq'), trunc2]
                execute_cli(fixed_args + input_files)
Esempio n. 4
0
 def test_load_corpus_from_directory(self):
     """Load corpora from directories and compare sizes with directory listings.

     Covers three cases: the ``dirdata`` directory (one item per direct
     entry), the ``test_data`` path (must raise NotADirectoryError), and the
     bare ``datapath()`` root, which is compared against a recursive listing.
     """
     single_dir = datapath('dirdata')
     not_a_dir = datapath('test_data')
     root_dir = datapath()

     # The glob is created before loading but only consumed afterwards.
     direct_entries = Path(single_dir).glob('*')
     single_corpus = self.directory_loader(path=single_dir)
     expected_single = len(list(direct_entries))
     self.assertEqual(len(single_corpus.data), expected_single)

     self.assertRaises(NotADirectoryError, self.directory_loader, path=not_a_dir)

     nested_corpus = self.directory_loader(path=root_dir)
     recursive_total = len(list(Path(root_dir).glob('**/*')))
     # NOTE(review): the expected size is one less than the recursive entry
     # count; presumably one entry is not loaded as data -- confirm which.
     self.assertEqual(len(nested_corpus.data), recursive_total - 1)
Esempio n. 5
0
def test_explicit_format_with_paired():
    """Run the paired pipeline with --format=fastq on inputs named ``*.txt``."""
    with temporary_path("paired.1.txt") as txt1, \
            temporary_path("paired.2.txt") as txt2:
        # Give the FASTQ fixtures a .txt extension so that the format has to
        # come from the explicit --format option.
        copies = (("paired.1.fastq", txt1), ("paired.2.fastq", txt2))
        for fixture_name, destination in copies:
            shutil.copyfile(datapath(fixture_name), destination)

        run_paired(
            '--format=fastq -a TTAGACATAT -m 14',
            in1=txt1,
            in2=txt2,
            expected1='paired.m14.1.fastq',
            expected2='paired.m14.2.fastq',
        )
Esempio n. 6
0
def test_no_trimming_legacy():
    """Run the CLI with adapter ``XXXXX`` and both outputs sent to /dev/null.

    Guards against a division by zero when running with this adapter.
    """
    discard = '/dev/null'
    cli_args = ['-a', 'XXXXX', '-o', discard, '-p', discard]
    cli_args += [
        '-pe1', datapath('paired.1.fastq'),
        '-pe2', datapath('paired.2.fastq'),
    ]
    execute_cli(cli_args)
Esempio n. 7
0
def test_unmatched_read_names():
    """Expect a non-zero return code when the paired inputs are out of order.

    A copy of ``paired.1.fastq`` with two four-line groups swapped is fed to
    the ``trim`` command together with the untouched ``paired.2.fastq``.
    The command must return a 2-tuple whose first element is non-zero.
    """
    with temporary_path("swapped.1.fastq") as swapped:
        try:
            # Create a file in which reads 2 and 3 are swapped: the
            # four-line groups at lines[4:8] and lines[8:12] trade places.
            with open(datapath('paired.1.fastq')) as f:
                lines = f.readlines()
            lines = lines[0:4] + lines[8:12] + lines[4:8] + lines[12:]
            with open(swapped, 'w') as f:
                f.writelines(lines)
            with redirect_stderr():
                command = get_command('trim')
                result = command.execute(
                    '-a XX -o out1.fastq -p out2.fastq'.split() +
                    ['-pe1', swapped, '-pe2',
                     datapath('paired.2.fastq')])
                assert isinstance(result, tuple)
                assert len(result) == 2
                assert result[0] != 0
        finally:
            # Clean up each output independently and tolerate a missing file.
            # An unguarded os.remove() here would raise FileNotFoundError,
            # hide the real test failure and skip the second removal.
            for leftover in ('out1.fastq', 'out2.fastq'):
                try:
                    os.remove(leftover)
                except FileNotFoundError:
                    pass
Esempio n. 8
0
def run_paired(
    params,
    in1,
    in2,
    expected1,
    expected2,
    aligners=('adapter', ),
    callback=None,
    assert_files_equal=True,
    error_on_rc=True,
):
    """Run the ``trim`` command on a pair of input files, once per aligner.

    Args:
        params: Extra command-line arguments, either as a whitespace-separated
            string or as any iterable of strings (list, tuple, ...).
        in1, in2: Input file name templates; each is formatted with
            ``aligner=...`` and resolved through ``datapath``.
        expected1, expected2: Expected-output name templates; each is
            formatted with ``aligner=...``, used to name the temporary
            outputs and resolved through ``cutpath`` for comparison.
        aligners: Aligner names to iterate over.
        callback: Optional callable invoked as
            ``callback(aligner, infiles, (p1, p2), result)`` after each run.
        assert_files_equal: When true, assert that both outputs equal the
            expected files.
        error_on_rc: When true, raise AssertionError if the first element of
            the command result is non-zero.

    Raises:
        AssertionError: On a malformed result, a non-zero return code (with
            ``error_on_rc``) or an output mismatch (with
            ``assert_files_equal``).
    """
    # isinstance also accepts str subclasses, unlike an exact type() check.
    if isinstance(params, str):
        params = params.split()
    for aligner in aligners:
        name1 = expected1.format(aligner=aligner)
        name2 = expected2.format(aligner=aligner)
        with temporary_path('tmp1-' + name1) as p1, \
                temporary_path('tmp2-' + name2) as p2:
            # list() copies any iterable; .copy() would reject tuples.
            p = list(params)
            p += ['--aligner', aligner, '-o', p1, '-p', p2]
            infiles = [
                datapath(i.format(aligner=aligner)) for i in (in1, in2)
            ]
            for infile_args in zip(('-pe1', '-pe2'), infiles):
                p.extend(infile_args)
            command = get_command('trim')
            result = command.execute(p)
            assert isinstance(result, tuple)
            assert len(result) == 2
            if error_on_rc and result[0] != 0:
                message = "Return code {} != 0".format(result[0])
                info = result[1]
                err = info['exception'] if info and 'exception' in info else None
                if err is None:
                    raise AssertionError(message)
                # Chain the recorded exception so its traceback is shown.
                raise AssertionError(message) from err['details'][1]

            if assert_files_equal:
                assert files_equal(cutpath(name1), p1)
                assert files_equal(cutpath(name2), p2)
            if callback:
                callback(aligner, infiles, (p1, p2), result)
Esempio n. 9
0
def run_interleaved(params, inpath, expected, aligners=('adapter', )):
    """Run the ``trim`` command with ``-l``/``-L`` files, once per aligner.

    Args:
        params: Extra command-line arguments, either as a whitespace-separated
            string or as any iterable of strings (list, tuple, ...).
        inpath: Input file name template; formatted with ``aligner=...`` and
            resolved through ``datapath`` before being passed via ``-l``.
        expected: Expected-output name template; formatted with
            ``aligner=...``, used to name the temporary file passed via
            ``-L`` and resolved through ``cutpath`` for comparison.
        aligners: Aligner names to iterate over.

    Raises:
        AssertionError: If the result is not a 2-tuple or the output differs
            from the expected file.
    """
    # isinstance also accepts str subclasses, unlike an exact type() check.
    if isinstance(params, str):
        params = params.split()
    for aligner in aligners:
        expected_name = expected.format(aligner=aligner)
        with temporary_path(expected_name) as tmp:
            # list() copies any iterable; .copy() would reject tuples.
            p = list(params)
            p += [
                '--aligner',
                aligner,
                '-l',
                datapath(inpath.format(aligner=aligner)),
                '-L',
                tmp,
            ]
            command = get_command('trim')
            result = command.execute(p)
            assert isinstance(result, tuple)
            assert len(result) == 2
            assert files_equal(cutpath(expected_name), tmp)
Esempio n. 10
0
 def test_file_info(self):
     """Smoke test: call ``utils.file_info`` on ``nfc.txt``; nothing is asserted."""
     utils.file_info(datapath('nfc.txt'))
Esempio n. 11
0
    def test_is_file_nfc(self):
        """``utils.is_file_nfc`` must accept ``nfc.txt`` and reject ``nfc_fail.txt``."""
        passing_fixture = datapath('nfc.txt')
        failing_fixture = datapath('nfc_fail.txt')

        # Pair each fixture with the assertion its result has to satisfy.
        expectations = (
            (passing_fixture, self.assertTrue),
            (failing_fixture, self.assertFalse),
        )
        for fixture, check in expectations:
            check(utils.is_file_nfc(fixture))
Esempio n. 12
0
 def test_load_corpus_from_text(self):
     """Build a corpus from the text of ``owe_pass`` and expect length 420."""
     # Read through a context manager so the file handle is closed
     # deterministically instead of being left to the garbage collector.
     with open(datapath('owe_pass')) as handle:
         text = handle.read()
     corpus = self.corpus_class(text=text)
     self.assertEqual(len(corpus), 420)
Esempio n. 13
0
 def test_load_corpus_from_path_stream(self):
     """Build a corpus from the ``owe_pass`` path with ``stream=True``; expect length 420."""
     loader_options = {
         'path': datapath('owe_pass'),
         'fformat': self.txt_extension,
         'stream': True,
     }
     streamed_corpus = self.corpus_class(**loader_options)
     self.assertEqual(len(streamed_corpus), 420)
Esempio n. 14
0
def test_missing_file():
    """Expect SystemExit when --paired-output is given with only one input file."""
    with raises(SystemExit):
        with redirect_stderr():
            options = ['-a', 'XX', '--paired-output', 'out.fastq']
            execute_cli(options + [datapath('paired.1.fastq')])
 def test_is_valid_owe_format(self):
     """The ``nfc.txt`` fixture must be rejected by the format validator."""
     rejected_fixture = datapath('nfc.txt')
     # NOTE(review): the attribute name contains a non-ASCII 'é'; confirm it
     # matches the spelling of the function defined in ``preprocessing``.
     verdict = preprocessing.is_valid_owé_format(rejected_fixture)
     self.assertFalse(verdict)