def test_vectordb(self):
    """It removes the vector from a vector database.

    Each case builds one read as random prefix + known vector stretch +
    random insert, cleans it with the '454' platform and checks that the
    cleaned read is not 5 or more bases shorter than the insert.
    """
    # Case 1: the blast database is given explicitly through -d.
    seq1 = create_random_seqwithquality(5, qual_range=35)
    vector_str = 'CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTC'
    vector = SeqWithQuality(Seq(vector_str), name='vect',
                            qual=[30] * len(vector_str))
    seq2 = create_random_seqwithquality(250, qual_range=35)
    seqs = [seq1 + vector + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()
    vector_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-d', vector_db]
    retcode = _call_python(cmd)[-1]
    assert retcode == 0
    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert (len(seq2.seq) - len(out_seqs[0].seq)) < 5

    # Case 2: -d is passed without a value.
    # NOTE(review): presumably clean_reads then falls back to a default
    # vector database -- confirm against the script's option handling.
    seq1 = create_random_seqwithquality(5, qual_range=35)
    vector_str = 'GGTGCCTCCGGCGGGCCACTCAATGCTTGAGTATACTCACTAGACTTTGCTTCGCAAAG'
    vector = SeqWithQuality(Seq(vector_str), name='vect',
                            qual=[30] * len(vector_str))
    seq2 = create_random_seqwithquality(250, qual_range=35)
    seqs = [seq1 + vector + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-d']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert (len(seq2.seq) - len(out_seqs[0].seq)) < 5
def test_illumina(self):
    """It tests the Illumina cleaning.

    The input read is a stretch created with qual_range=35 followed by
    one created with qual_range=15. It is cleaned three times: with the
    default options, with -x, and from a 'fastq-illumina' file.
    """
    seq1 = create_random_seqwithquality(50, qual_range=35)
    seq2 = create_random_seqwithquality(10, qual_range=15)
    seqs = [seq1 + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    # default cleaning
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'illumina', '-f', 'fastq']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0
    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert out_seqs[0].qual[-2] == 35

    # disable quality trimming: the read must come back unchanged
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'illumina', '-f', 'fastq', '-x']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert seqs[0].seq == out_seqs[0].seq

    # illumina format
    inseq_fhand = create_temp_seq_file(seqs, format='fastq-illumina')[0]
    outseq_fhand = NamedTemporaryFile()
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'illumina', '-f', 'fastq-illumina']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq-illumina'))
    assert out_seqs[0].qual[-2] == 35
def test_adaptors(self):
    """It removes adaptors.

    Three situations are covered: an explicit adaptors file, -a without
    a value on the '454' platform (exit code 0 and the adaptors file
    reported on stdout), and -a without a value on the 'illumina'
    platform (exit code 14 with an explanatory stderr message).
    """
    # explicit adaptors file: only the part after the adaptor remains
    seq1 = create_random_seqwithquality(5, qual_range=35)
    adaptor = create_random_seqwithquality(15, qual_range=35)
    seq2 = create_random_seqwithquality(50, qual_range=35)
    seqs = [seq1 + adaptor + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()
    adaptor_fhand = create_temp_seq_file([adaptor], format='fasta')[0]
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'illumina', '-f', 'fastq', '-a', adaptor_fhand.name]
    retcode = _call_python(cmd)[-1]
    assert retcode == 0
    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert seq2.seq == out_seqs[0].seq

    # -a without a value on the 454 platform
    seq1 = create_random_seqwithquality(5, qual_range=35)
    adaptor = create_random_seqwithquality(15, qual_range=35)
    seq2 = create_random_seqwithquality(50, qual_range=35)
    seqs = [seq1 + adaptor + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-a']
    stdout, stderr, retcode = _call_python(cmd)
    assert retcode == 0
    assert "--adaptors_file: {'454': '" in stdout

    # -a without a value on the illumina platform is an error
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'illumina', '-f', 'fastq', '-a']
    stdout, stderr, retcode = _call_python(cmd)
    assert 'clean_reads does not have default adaptors file' in stderr
    assert retcode == 14
def test_sanger(self):
    """It tests the basic sanger cleaning.

    First the command line validation is checked (missing platform,
    unknown platform, -x together with --lucy_splice). Then a read with
    a separate quality file is cleaned with and without -x, and finally
    a fasta-only read ending in a run of Ns is cleaned.
    """
    seq1 = create_random_seqwithquality(500, qual_range=55)
    seq2 = create_random_seqwithquality(50, qual_range=15)
    seqs = [seq1 + seq2]
    inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')
    outseq_fhand = NamedTemporaryFile()
    outqual_fhand = NamedTemporaryFile()

    # platform is required
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name]
    stderr = _call_python(cmd)[1]
    assert 'required' in stderr

    # a correct platform is required
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'hola']
    stderr = _call_python(cmd)[1]
    assert 'choice' in stderr

    # disable quality trimming and lucy_splice are incompatible
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'sanger', '-x', '--lucy_splice', 'splice.fasta']
    stderr = _call_python(cmd)[1]
    assert 'incompatible' in stderr

    # we can clean a sanger sequence with quality
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-q', inqual_fhand.name,
           '-o', outseq_fhand.name, '-u', outqual_fhand.name,
           '-p', 'sanger']
    retcode = _call_python(cmd)[2]
    assert retcode == 0
    # Nested with blocks close both result handles after parsing.
    with open(outseq_fhand.name) as result_seq_fhand:
        with open(outqual_fhand.name) as result_qual_fhand:
            out_seqs = list(seqs_in_file(seq_fhand=result_seq_fhand,
                                         qual_fhand=result_qual_fhand))
    assert out_seqs[0].qual[-1] == 55

    # disable quality trimming
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-q', inqual_fhand.name,
           '-o', outseq_fhand.name, '-u', outqual_fhand.name,
           '-p', 'sanger', '-x']
    retcode = _call_python(cmd)[2]
    assert retcode == 0
    with open(outseq_fhand.name) as result_seq_fhand:
        with open(outqual_fhand.name) as result_qual_fhand:
            out_seqs = list(seqs_in_file(seq_fhand=result_seq_fhand,
                                         qual_fhand=result_qual_fhand))
    assert seqs[0].seq == out_seqs[0].seq

    # we can clean a sanger sequence without quality
    seq1 = create_random_seqwithquality(500, qual_range=55)
    seqs = [SeqWithQuality(seq1.seq + Seq('NNNNNNNNNNNNNN'), name='Ns')]
    inseq_fhand = create_temp_seq_file(seqs, format='fasta')[0]
    outseq_fhand = NamedTemporaryFile()
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'sanger']
    retcode = _call_python(cmd)[2]
    assert retcode == 0
    with open(outseq_fhand.name) as result_seq_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_seq_fhand))
    # the trailing run of Ns must not survive the cleaning
    assert not str(out_seqs[0].seq).lower().endswith('nnnnn')
def test_seq_pipeline_parallel_run_with_fasta_qual(self):
    """The pipeline runs in parallel with fasta and qual.

    Three reads are written as a fasta file plus a quality file and
    pushed through the 'sanger_with_qual' pipeline with processes=4.
    The fasta output must still hold three records.
    """
    pipeline = 'sanger_with_qual'

    # The adaptors are flushed to disk because the configuration hands
    # the file over by name.
    fhand_adaptors = NamedTemporaryFile()
    fhand_adaptors.write(ADAPTORS)
    fhand_adaptors.flush()

    arabidopsis_genes = 'arabidopsis_genes+'
    univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
    configuration = {'remove_vectors': {'vectors': univec},
                     'remove_adaptors': {'adaptors': fhand_adaptors.name}}

    seq1 = create_random_seqwithquality(500, qual_range=50)
    seq2 = create_random_seqwithquality(500, qual_range=51)
    seq3 = create_random_seqwithquality(500, qual_range=52)
    seqs = [seq1, seq2, seq3]
    inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')

    outseq_fhand = NamedTemporaryFile()
    outqual_fhand = NamedTemporaryFile()
    writer = SequenceWriter(outseq_fhand, qual_fhand=outqual_fhand,
                            file_format='fasta')
    writers = {'seq': writer}

    in_fhands = {}
    in_fhands['in_seq'] = open(inseq_fhand.name)
    in_fhands['in_qual'] = open(inqual_fhand.name)
    try:
        seq_pipeline_runner(pipeline, configuration, in_fhands,
                            processes=4, writers=writers)
    finally:
        # Release the input handles even if the pipeline fails.
        for in_fhand in in_fhands.values():
            in_fhand.close()

    with open(outseq_fhand.name, 'r') as out_fhand:
        result_seq = out_fhand.read()
    assert result_seq.count('>') == 3
def test_tempdir(self):
    """It tests that the --tmpdir option works fine.

    A run with the current directory as tmpdir must exit with 0. Runs
    that point --tmpdir to '/usr' and to '/usr/remove_this_dir' must
    fail with a 'Permission denied' message (exit codes 1 and 14).
    """
    errorlog_path = 'clean_reads.error'
    try:
        seq1 = create_random_seqwithquality(500, qual_range=55)
        seq2 = create_random_seqwithquality(50, qual_range=15)
        seqs = [seq1 + seq2]
        inseq_fhand = create_temp_seq_file(seqs, format='qual')[0]
        outseq_fhand = NamedTemporaryFile()

        # a usable tmpdir
        cmd = [CLEAN_READS, '-i', inseq_fhand.name,
               '-o', outseq_fhand.name, '-p', 'sanger', '--tmpdir', '.']
        retcode = _call_python(cmd)[-1]
        assert retcode == 0

        # an existing directory where the test user is not expected to
        # have write permission
        dir_without_perm = '/usr'
        cmd = [CLEAN_READS, '-i', inseq_fhand.name,
               '-o', outseq_fhand.name, '-p', 'sanger',
               '--tmpdir', dir_without_perm]
        stderr, retcode = _call_python(cmd)[1:]
        assert retcode == 1
        assert "Permission denied: '%s" % dir_without_perm in stderr

        # a tmpdir below that directory
        dir_without_perm = '/usr/remove_this_dir'
        cmd = [CLEAN_READS, '-i', inseq_fhand.name,
               '-o', outseq_fhand.name, '-p', 'sanger',
               '--tmpdir', dir_without_perm]
        stderr, retcode = _call_python(cmd)[1:]
        assert retcode == 14
        assert "Permission denied: '%s" % dir_without_perm in stderr
    finally:
        # Remove the error log even when an assertion above fails, so a
        # failing run does not leave a stray file in the working dir.
        if os.path.exists(errorlog_path):
            os.remove(errorlog_path)
def test_vector(self):
    """It removes the vector.

    The read carries the slice [30:60] of a random vector between a
    random prefix and a random insert. The whole vector is passed as a
    fasta file with -v, and the cleaned read must not be 5 or more
    bases shorter than the insert.
    """
    seq1 = create_random_seqwithquality(5, qual_range=35)
    vector = create_random_seqwithquality(3000, qual_range=35)
    seq2 = create_random_seqwithquality(250, qual_range=35)
    seqs = [seq1 + vector[30:60] + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()
    vector_fhand = create_temp_seq_file([vector], format='fasta')[0]

    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-v', vector_fhand.name]
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert (len(seq2.seq) - len(out_seqs[0].seq)) < 5
def test_edge_trim(self):
    """It trims the sequence edges.

    With '-e 10,10' the cleaned read must be exactly 20 bases shorter
    than the input read.
    """
    seq2 = create_random_seqwithquality(250, qual_range=35)
    seqs = [seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-e', '10,10']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert len(seq2.seq) - len(out_seqs[0].seq) == 20
def test_words(self):
    """It trims re words.

    The read starts with 'ACTG', which matches the first of the regular
    expressions given with -r; after cleaning only the random part
    that follows it must remain.
    """
    word = 'ACTG'
    vector = SeqWithQuality(Seq(word), name='vect', qual=[30] * len(word))
    seq2 = create_random_seqwithquality(250, qual_range=35)
    seqs = [vector + seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    # The expressions travel as one comma separated, quoted argument.
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq',
           '-r', '"^ACTG","TTTTTTTTTTTTTT"']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert seq2.seq == out_seqs[0].seq
def test_min_length(self):
    """Filtering by length.

    Reads created with sizes 250, 50 and 250 are cleaned with '-m 51';
    only the two 250-long reads are expected in the output.
    """
    seq1 = create_random_seqwithquality(250, qual_range=35)
    seq2 = create_random_seqwithquality(50, qual_range=35)
    seq3 = create_random_seqwithquality(250, qual_range=35)
    seqs = [seq1, seq2, seq3]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-m', '51']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert len(out_seqs) == 2
    assert len(out_seqs[0]) == 250
    assert len(out_seqs[1]) == 250
def test_trim_as_mask(self):
    """It masks the regions to trim.

    With '-e 10,10' plus '--mask_no_trim' the read keeps its length:
    the first and last 10 bases must be lower case and everything in
    between upper case.
    """
    seq2 = create_random_seqwithquality(250, qual_range=35)
    seqs = [seq2]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq', '-e', '10,10', '--mask_no_trim']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert len(seq2.seq) == len(out_seqs[0].seq)

    seq = str(out_seqs[0].seq)
    # All 10 leading bases are checked; the former [0:9] slice left the
    # base at index 9 unverified.
    assert seq[:10].islower()
    assert seq[10:len(seq) - 10].isupper()
    assert seq[-10:].islower()
def test_fastq(self):
    """Cleaning fastq seqs in parallel.

    Three reads, each made of a stretch created with qual_range=55
    followed by one created with qual_range=15, are cleaned from a
    fastq file with the 'sanger' platform and '-t 4'.
    """
    seq1 = create_random_seqwithquality(500, qual_range=55)
    seq2 = create_random_seqwithquality(50, qual_range=15)
    seq3 = create_random_seqwithquality(500, qual_range=55)
    seq4 = create_random_seqwithquality(50, qual_range=15)
    seq5 = create_random_seqwithquality(500, qual_range=55)
    seq6 = create_random_seqwithquality(50, qual_range=15)
    seqs = [seq1 + seq2, seq3 + seq4, seq5 + seq6]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    # we can clean fastq sequences with the sanger platform
    # NOTE(review): -t 4 presumably sets the number of parallel
    # workers -- confirm against the clean_reads options.
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', 'sanger', '-t', '4', '-f', 'fastq']
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert out_seqs[0].qual[-1] == 55
def test_filter(self):
    """Filtering by blast similarity.

    Two random reads and one fixed read named 'ara' are cleaned with
    the 'arabidopsis_genes+' blast database listed twice in
    --filter_dbs; only two reads are expected in the output.
    """
    seq1 = create_random_seqwithquality(150, qual_range=35)
    ara_seq = ('CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTCTC'
               'TACGGCAAGAATATCGCCGGAAAACTCAGTTACGGATCTGTTAAAGACGTCCGTGGAATCCA'
               'AGCTAAAGAAGCTTTCCTTTGGCTACCAATCACCGCCATGGAATCGGATCCAAGCTCTGCCA')
    seq2 = SeqWithQuality(Seq(ara_seq), name='ara',
                          qual=[30] * len(ara_seq))
    seq3 = create_random_seqwithquality(150, qual_range=35)
    seqs = [seq1, seq2, seq3]
    inseq_fhand = create_temp_seq_file(seqs, format='fastq')[0]
    outseq_fhand = NamedTemporaryFile()

    ara_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
    # The same database is given twice to exercise the comma separated
    # list form of the option.
    cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
           '-p', '454', '-f', 'fastq',
           '--filter_dbs', ','.join((ara_db, ara_db))]
    retcode = _call_python(cmd)[-1]
    assert retcode == 0

    # Read the result inside a with block so the handle is closed.
    with open(outseq_fhand.name) as result_fhand:
        out_seqs = list(seqs_in_file(seq_fhand=result_fhand,
                                     format='fastq'))
    assert len(out_seqs) == 2