def test_calculate_stats():
    'It checks the text reports built by calculate_sequence_stats'
    fname_template = 'pairend{0}.sfastq'
    in_fhands = [open(join(TEST_DATA_DIR, fname_template.format(num)))
                 for num in range(1, 6)]
    reads = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
    stats = calculate_sequence_stats(reads, nxs=[50])

    # length, quality and nucleotide frequency reports
    assert 'maximum: 4' in stats['length']
    assert 'N50' in stats['length']
    assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in stats['qual_boxplot']
    assert '[30 , 31[ (96): **********' in stats['quality']
    assert 'Q30: 100.0' in stats['quality']
    assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in stats['nucl_freq']
    assert stats['kmer'] == ''

    # kmers are only reported when a kmer size is given
    gene_fhands = [open(join(TEST_DATA_DIR, 'arabidopsis_genes'))]
    genes = list(read_seqs(gene_fhands, prefered_seq_classes=[SEQRECORD]))
    kmer_report = calculate_sequence_stats(genes)['kmer']
    assert 'Kmer distribution' not in kmer_report

    kmer_report = calculate_sequence_stats(genes, kmer_size=3)['kmer']
    assert 'Kmer distribution' in kmer_report
    assert 'TCT: 167' in kmer_report

    # the dust report is only filled when it is explicitly requested
    dust_report = calculate_sequence_stats(genes)['dustscore']
    assert not dust_report
    dust_report = calculate_sequence_stats(genes,
                                           do_dust_stats=True)['dustscore']
    assert 'average: 1.83\nvariance: 0.14\nnum. seqs.: 6\n' in dust_report
    assert '% above 7 (low complexity): 0.00' in dust_report
def test_sample_seq(self):
    'It runs the sample_seqs binary asking for different sample sizes'
    sample_seq = os.path.join(BIN_DIR, 'sample_seqs')
    assert 'usage' in check_output([sample_seq, '-h'])

    fasta_fhand = NamedTemporaryFile()
    fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
    fasta_fhand.flush()

    def _num_seqs(stdout):
        'It counts the sequences found in the output of the binary'
        return count_seqs(read_seqs([StringIO(stdout)]))['num_seqs']

    # random samples that are smaller than the input
    for sample_size in (1, 2):
        stdout = check_output([sample_seq, '-n', str(sample_size),
                               fasta_fhand.name])
        assert _num_seqs(stdout) == sample_size

    # random sample larger than the input
    # NOTE(review): nothing is asserted when the binary does not fail
    stderr = NamedTemporaryFile()
    try:
        check_output([sample_seq, '-n', '10', fasta_fhand.name],
                     stderr=stderr)
    except CalledProcessError:
        assert 'larger' in open(stderr.name).read()

    # random sample with stdin
    stdout = check_output([sample_seq, '-n', '2'],
                          stdin=open(fasta_fhand.name))
    assert _num_seqs(stdout) == 2
def test_seqitems_io(self):
    'It reads and writes fasta through the different seq classes'
    fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

    # SeqItem
    seqs = list(read_seqs([StringIO(fasta)], prefered_seq_classes=[SEQITEM]))
    assert seqs[0].kind == SEQITEM
    out_fhand = StringIO()
    write_seqs(seqs, out_fhand)
    assert out_fhand.getvalue() == fasta
    assert seqs[0].object.name == 's1'

    # SeqRecord
    seqs = list(read_seqs([StringIO(fasta)],
                          prefered_seq_classes=[SEQRECORD]))
    assert seqs[0].kind == SEQRECORD
    out_fhand = StringIO()
    write_seqs(seqs, out_fhand, 'fasta')
    assert out_fhand.getvalue() == fasta

    # seqitem not possible with different input and output formats
    try:
        list(read_seqs([StringIO(fasta)], out_format='fastq',
                       prefered_seq_classes=[SEQITEM]))
    except ValueError:
        pass
    else:
        self.fail('ValueError expected')

    # seqitem is fine when input and output formats match
    seqs = list(read_seqs([StringIO(fasta)], out_format='fasta',
                          prefered_seq_classes=[SEQITEM]))
    out_fhand = StringIO()
    write_seqs(seqs, out_fhand)
    assert out_fhand.getvalue() == fasta
def test_giuseppe_reads():
    'It splits real 454 reads, directly and through process_seq_packets'
    linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')

    def _new_splitter():
        'It builds a splitter with a fresh copy of the linkers'
        linkers = list(read_seqs([open(linker_fpath)]))
        return MatePairSplitter(linkers=linkers)

    def _split_directly(reads_fpath):
        'It calls the splitter on every packet of two reads'
        splitter = _new_splitter()
        split_seqs = []
        for packet in read_seq_packets([open(reads_fpath)], 2):
            split_seqs.extend(splitter(packet))
        return split_seqs

    def _split_with_pipeline(reads_fpath):
        'It lets process_seq_packets drive the splitter'
        splitter = _new_splitter()
        packets = read_seq_packets([open(reads_fpath)], 2)
        packets = process_seq_packets(packets, [splitter])[0]
        return [seq for packet in list(packets) for seq in packet]

    cases = (
        ('454_reads.fastq', ('G109AZL01BJHT8\\1', 'G109AZL01BJHT8\\2')),
        ('454_reads2.fastq', ('G109AZL01D8U3X\\1', 'G109AZL01D8U3X\\2')),
    )
    for reads_fname, expected_names in cases:
        reads_fpath = os.path.join(TEST_DATA_DIR, reads_fname)
        for split in (_split_directly, _split_with_pipeline):
            split_seqs = split(reads_fpath)
            names = [get_name(seq) for seq in split_seqs]
            for expected_name in expected_names:
                assert expected_name in names
            assert len(split_seqs) == 20
def _read_pairs(in_fhands, paired_reads):
    'It reads SeqItems and groups them by name or as single-read groups'
    reads = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    if not paired_reads:
        # unpaired input: every read becomes its own group
        return group_pairs(reads, n_seqs_in_pair=1)
    return group_pairs_by_name(reads)
def sort_fastx_files(in_fhands, key, index_fpath=None, directory=None,
                     max_items_in_memory=None, tempdir=None):
    '''It returns the reads sorted by sequence, name or reference position.

    key must be 'seq', 'name' or 'coordinate'; any other value raises a
    ValueError. index_fpath and directory are only used by the coordinate
    sort and max_items_in_memory only by the seq and name sorts.
    '''
    if key == 'coordinate':
        return sort_by_position_in_ref(in_fhands, index_fpath=index_fpath,
                                       directory=directory, tempdir=tempdir)

    if key == 'seq':
        sort_key = get_str_seq
    elif key == 'name':
        sort_key = get_name
    else:
        raise ValueError('Non-supported sorting key')

    return sorted_items(read_seqs(in_fhands), key=sort_key, tempdir=tempdir,
                        max_items_in_memory=max_items_in_memory)
def _read_pairs(in_fhands, paired_reads):
    'It reads SeqItems and yields them in pairs or as one-item lists'
    reads = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    if paired_reads:
        return group_seqs_in_pairs(reads)
    # unpaired input: wrap every read lazily in its own list
    return ([read] for read in reads)
def test_count_seqs():
    'It counts the reads and the bases of the five pairend test files'
    fpaths = [join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(num))
              for num in range(1, 6)]
    in_fhands = [open(fpath) for fpath in fpaths]
    reads = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
    assert count_seqs(reads) == {'total_length': 96, 'num_seqs': 24}
def test_deinterleave(self):
    'It checks that deinterleaving undoes the interleaving of two files'
    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
    interleaved = interleave_pairs(read_seqs([open(fwd_fpath)], 'fastq'),
                                   read_seqs([open(rev_fpath)], 'fastq'))

    fwd_out = StringIO()
    rev_out = StringIO()
    deinterleave_pairs(interleaved, fwd_out, rev_out, 'fastq')

    # each output should match its input file, surrounding blanks aside
    for out_fhand, fpath in ((fwd_out, fwd_fpath), (rev_out, rev_fpath)):
        assert out_fhand.getvalue().strip() == open(fpath).read().strip()
def test_filter_chimeras(self):
    'It sorts three read pairs into mapped, chimeric and unknown files'
    reads = [
        # Typical non chimeric pair
        ('>seq1 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTA'
         'CATTGAACTT\n'),
        ('>seq1 2:Y:18:ATCACG\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
         'GGTTGTAACG\n'),
        # Typical chimeric pair
        ('>seq2 1:Y:18:ATCACG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGT'
         'CTGCGATCCCTG'
         'GGTAGACTGAGGCCTTCTCGAACTACAAATCATCACCAGACCATGTCCGA\n'),
        ('>seq2 2:Y:18:ATCACG\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATT'
         'GATGCTGAACTT'
         'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'),
        # Unknown, 3' end does not map, impossible to know if it is chimeric
        ('>seq7 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCT'
         'ACATTGAACTT'
         'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'),
        ('>seq7 2:Y:18:ATCACG\nATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
         'GGTTGTAACGCC'
         'CCTCGAAGGTACCTTTGCCAGACTGGGCTACAGGACACCCAGTCTCCCGGGAGTCT\n'),
    ]

    in_fhand = NamedTemporaryFile()
    in_fhand.write(''.join(reads))
    in_fhand.flush()
    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(GENOME)
    ref_fhand.flush()

    out_fhand = NamedTemporaryFile()
    chimeras_fhand = NamedTemporaryFile()
    unknown_fhand = NamedTemporaryFile()
    filter_chimeras(ref_fhand.name, out_fhand, chimeras_fhand, [in_fhand],
                    unknown_fhand)

    # every output file should only hold the pair of its own kind
    checks = (
        (read_seqs([out_fhand]), ['seq1.f', 'seq1.r']),
        (read_seqs([chimeras_fhand]), ['seq2.f', 'seq2.r']),
        (read_seqs([unknown_fhand]), ['seq7.f', 'seq7.r']),
    )
    for seqs, allowed_names in checks:
        for seq in seqs:
            assert get_name(seq) in allowed_names
def test_trim_chimeras_bin(self):
    'It runs the trim_mp_chimeras binary with one and with two processes'
    trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
    assert 'usage' in check_output([trim_chimeras_bin, '-h'])

    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    fwd_read = (
        '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    )
    rev_read = (
        '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    )

    in_fhand = NamedTemporaryFile()
    in_fhand.write(fwd_read + rev_read)
    in_fhand.flush()
    out_fhand = NamedTemporaryFile()

    expected_seqs = ['GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
                     'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG']
    base_cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
                '-o', out_fhand.name]

    # first with the default options, then with several threads
    for extra_args in ([], ['-p', '2']):
        check_output(base_cmd + extra_args, stdin=in_fhand)
        counts = 0
        for seq in read_seqs([open(out_fhand.name)]):
            assert get_str_seq(seq) in expected_seqs
            counts += 1
        assert counts != 0
def test_seqitems_io(self):
    'It reads and writes fasta through the different seq classes'
    fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

    def _written(seqs, *write_args):
        'It returns what write_seqs puts in a fresh in-memory file'
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand, *write_args)
        return out_fhand.getvalue()

    # SeqItem
    seqs = list(read_seqs([StringIO(fasta)], 'fasta',
                          prefered_seq_classes=[SEQITEM]))
    assert seqs[0].kind == SEQITEM
    assert _written(seqs) == fasta
    assert seqs[0].object.name == 's1'

    # SeqRecord
    seqs = list(read_seqs([StringIO(fasta)], 'fasta',
                          prefered_seq_classes=[SEQRECORD]))
    assert seqs[0].kind == SEQRECORD
    assert _written(seqs, 'fasta') == fasta

    # seqitem not possible with different input and output formats
    try:
        list(read_seqs([StringIO(fasta)], 'fasta', out_format='fastq',
                       prefered_seq_classes=[SEQITEM]))
    except ValueError:
        pass
    else:
        self.fail('ValueError expected')

    # seqitem is fine when input and output formats match
    seqs = list(read_seqs([StringIO(fasta)], 'fasta', out_format='fasta',
                          prefered_seq_classes=[SEQITEM]))
    assert _written(seqs) == fasta
def test_sample_seq(self):
    'It runs the sample_seqs binary reading from a file and from stdin'
    sample_seq = os.path.join(BIN_DIR, 'sample_seqs')
    assert 'usage' in check_output([sample_seq, '-h'])

    fasta_fhand = NamedTemporaryFile()
    fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
    fasta_fhand.flush()

    def _num_seqs(stdout):
        'It counts the sequences found in the output of the binary'
        return count_seqs(read_seqs([StringIO(stdout)]))['num_seqs']

    # random samples taken from a file given in the command line
    for sample_size in (1, 2):
        cmd = [sample_seq, '-n', str(sample_size), fasta_fhand.name]
        assert _num_seqs(check_output(cmd)) == sample_size

    # random sample with stdin
    stdout = check_output([sample_seq, '-n', '2'],
                          stdin=open(fasta_fhand.name))
    assert _num_seqs(stdout) == 2
def test_trim_chimeras_bin(self):
    'It checks the reads written by the trim_mp_chimeras binary'
    trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
    assert 'usage' in check_output([trim_chimeras_bin, '-h'])

    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    query = ''.join([
        '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
        'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$',
        '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n',
        '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n',
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n',
    ])
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    out_fhand = NamedTemporaryFile()

    expected_seqs = ['GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
                     'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG']

    def _run_and_check(cmd):
        'It runs the binary and checks every read found in the output'
        check_output(cmd, stdin=in_fhand)
        counts = 0
        for seq in read_seqs([open(out_fhand.name)]):
            assert get_str_seq(seq) in expected_seqs
            counts += 1
        assert counts != 0

    cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
           '-o', out_fhand.name]
    _run_and_check(cmd)

    # With several threads
    _run_and_check(cmd + ['-p', '2'])
def test_interleave(self):
    'It interleaves reads from files that do and do not match'
    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')

    # the names in these two files do not match
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
    fwd_seqs = list(read_seqs([open(fwd_fpath)], 'fastq'))
    rev_seqs = list(read_seqs([open(rev_fpath)], 'fastq'))
    try:
        list(interleave_pairs(fwd_seqs, rev_seqs))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')

    # with the checks skipped the same reads are accepted
    interleaved = list(interleave_pairs(fwd_seqs, rev_seqs,
                                        skip_checks=True))
    assert len(interleaved) == 8

    # these two files interleave with the checks enabled
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
    fwd_seqs = read_seqs([open(fwd_fpath)], 'fastq')
    rev_seqs = read_seqs([open(rev_fpath)], 'fastq')
    interleaved = list(interleave_pairs(fwd_seqs, rev_seqs))
    assert len(interleaved) == 8
def test_calculate_stats_seqitems():
    'It checks the reports of calculate_sequence_stats fed with SeqItems'
    fname_template = 'pairend{0}.sfastq'
    in_fhands = [open(join(TEST_DATA_DIR, fname_template.format(num)))
                 for num in range(1, 6)]
    reads = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    stats = calculate_sequence_stats(reads, nxs=[50])

    assert 'maximum: 4' in stats['length']
    assert 'N50' in stats['length']
    assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in stats['qual_boxplot']
    assert '[30 , 31[ (96): **********' in stats['quality']
    assert 'Q30: 100.0' in stats['quality']
    assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in stats['nucl_freq']
    # no kmer size was given, so the kmer report stays empty
    assert stats['kmer'] == ''
def test_orf_annotator(self):
    'It checks the ORF features added by the ESTScan annotator'
    seqs_fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
    matrix_fpath = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
    seqs = list(read_seqs([open(seqs_fpath)],
                          prefered_seq_classes=[SEQRECORD]))

    annotate_orfs = EstscanOrfAnnotator(matrix_fpath)
    annotated = annotate_orfs(seqs)

    # the first two seqs get one ORF each, on opposite strands
    for index, strand in enumerate((1, -1)):
        orf = annotated[index].object.features[0]
        assert orf.strand == strand
        assert orf.location.start.position == 0
        assert orf.location.end.position == 541

    # the third seq gets no feature
    assert not annotated[2].object.features
def test_orf_annotator(self):
    'It checks strand and location of the ORFs annotated by ESTScan'
    seqs_fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
    matrix_fpath = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
    seqs = list(read_seqs([open(seqs_fpath)],
                          prefered_seq_classes=[SEQRECORD]))
    annotated = EstscanOrfAnnotator(matrix_fpath)(seqs)

    def _summary(orf):
        'It returns the strand, start and end of an ORF feature'
        location = orf.location
        return orf.strand, location.start.position, location.end.position

    fwd_orf = annotated[0].object.features[0]
    rev_orf = annotated[1].object.features[0]
    assert _summary(fwd_orf) == (1, 0, 541)
    assert _summary(rev_orf) == (-1, 0, 541)
    # the last seq gets no feature
    assert not annotated[2].object.features
def _read_estcan_result(fhand, result, file_type):
    '''It adds the sequences of a dna or pep ESTScan file to result.

    result is filled in place: result[seq_id][(start, end, strand)] is a
    dict in which the sequence string is stored under the file_type key.
    '''
    for seq in read_seqs([fhand], file_format='fasta'):
        fields = [field.strip() for field in get_description(seq).split(';')]
        if 'minus strand' in fields:
            strand = -1
        else:
            strand = 1
        # the second and third words of the first field are start and end
        start, end = fields[0].split(' ', 3)[1:3]

        # estscan changes the name, we have to fix it
        seq_id = get_name(seq).strip(';')

        seq_orfs = result.setdefault(seq_id, {})
        orf = seq_orfs.setdefault((int(start), int(end), strand), {})
        orf[file_type] = get_str_seq(seq)
def _get_chrom_lengths(self):
    '''It returns an OrderedDict with the length of every chromosome.

    When a reference file handle is available the lengths are the lengths
    of the reference sequences. Otherwise the gzipped VCF is scanned and
    the largest position found for each chromosome is used as its length,
    keeping the chromosomes in order of first appearance.
    '''
    chrom_lens = OrderedDict()

    if self._ref_fhand is not None:
        for read in read_seqs([self._ref_fhand]):
            chrom_lens[get_name(read)] = get_length(read)
        return chrom_lens

    vcf_fhand = gzip.open(self._reader.fhand.name)
    try:
        for line in vcf_fhand:
            line = line.strip()
            # header lines and blank lines carry no position
            if not line or line.startswith('#'):
                continue
            items = line.split()
            chrom = items[0]
            loc = int(items[1])
            if chrom not in chrom_lens or loc > chrom_lens[chrom]:
                chrom_lens[chrom] = loc
    finally:
        # the handle used to be left open once the scan finished
        vcf_fhand.close()
    return chrom_lens
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands, unknown_fhand, unpaired=False, paired_result=True, settings=get_setting('CHIMERAS_SETTINGS'), min_seed_len=None, directory=None): file_format = get_format(in_fhands[0]) if unpaired: unpaired_fpaths = [fhand.name for fhand in in_fhands] paired_fpaths = None else: f_fhand = NamedTemporaryFile() r_fhand = NamedTemporaryFile() seqs = read_seqs(in_fhands) deinterleave_pairs(seqs, f_fhand, r_fhand, file_format) paired_fpaths = [f_fhand.name, r_fhand.name] unpaired_fpaths = None bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths, directory, file_format, min_seed_len) total = 0 chimeric = 0 unknown = 0 for pair, kind in classify_mapped_reads(bamfile, settings=settings, paired_result=paired_result, file_format=file_format): if kind is NON_CHIMERIC: write_seqs(pair, out_fhand) elif kind is CHIMERA and chimeras_fhand is not None: write_seqs(pair, chimeras_fhand) chimeric += 1 elif kind is UNKNOWN and unknown_fhand is not None: write_seqs(pair, unknown_fhand) unknown += 1 total += 1 mapped = total - chimeric - unknown print 'Total pairs analyzed: ', total print 'Chimeric pairs filtered: ', chimeric, '\t', chimeric / float(total) print 'Unknown pairs found: ', unknown, '\t', unknown / float(total) print 'Non-chimeric pairs: ', mapped, '\t', mapped / float(total)
def test_pair_matcher(self):
    'It checks which reads match_pairs writes as pairs and as orphans'

    def _fpath(fname):
        'It returns the path of a file in the test data directory'
        return os.path.join(TEST_DATA_DIR, fname)

    def _match(fwd_seqs, rev_seqs):
        'It matches the zipped reads and returns pairs and orphans text'
        paired_fhand = StringIO()
        orphan_fhand = StringIO()
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq')
        return paired_fhand.getvalue(), orphan_fhand.getvalue()

    # with equal seqs but the last ones
    fwd_seqs = read_seqs([open(_fpath('pairend1.sfastq'))],
                         file_format='fastq')
    rev_seqs = read_seqs([open(_fpath('pairend2.sfastq'))],
                         file_format='fastq')
    output, orp = _match(fwd_seqs, rev_seqs)
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with the firsts seqs different
    fwd_seqs = read_seqs([open(_fpath('pairend1.sfastq'))], 'fastq')
    rev_seqs = read_seqs([open(_fpath('pairend3.sfastq'))], 'fastq')
    output, orp = _match(fwd_seqs, rev_seqs)
    assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
    assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    fwd_seqs = read_seqs([open(_fpath('pairend4.sfastq'))], 'fastq')
    rev_seqs = read_seqs([open(_fpath('pairend2.sfastq'))], 'fastq')
    output, orp = _match(fwd_seqs, rev_seqs)
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with reads with no direction
    fwd_seqs = read_seqs([open(_fpath('pairend7.sfastq'))], 'fastq')
    rev_seqs = read_seqs([open(_fpath('pairend2.sfastq'))], 'fastq')
    output, orp = _match(fwd_seqs, rev_seqs)
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    # NOTE(review): the same header is checked twice; one of the two was
    # possibly meant to be the 1:Y:18 read -- confirm against the data
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
    assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

    # File is not sorted
    unsorted_fastq = ('@s1.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
                      '@s2.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
                      '@s1.r\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n')
    seqs = read_seqs([StringIO(unsorted_fastq)], 'fastq')
    paired_fhand = StringIO()
    orphan_fhand = StringIO()
    try:
        match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq')
    except MalformedFile:
        pass
    else:
        self.fail('MalformedFile error expected')
def test_mate_pair_checker():
    'It checks which reads match_pairs writes as pairs and as orphans'

    def _match(fwd_seqs, rev_seqs):
        'It matches the zipped reads and returns pairs and orphans text'
        paired_fhand = StringIO()
        orphan_fhand = StringIO()
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq')
        return paired_fhand.getvalue(), orphan_fhand.getvalue()

    # with equal seqs but the last ones
    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
    output, orp = _match(read_seqs([open(fwd_fpath)], file_format='fastq'),
                         read_seqs([open(rev_fpath)], file_format='fastq'))
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with the firsts seqs different
    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
    output, orp = _match(read_seqs([open(fwd_fpath)], 'fastq'),
                         read_seqs([open(rev_fpath)], 'fastq'))
    assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
    assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
    output, orp = _match(read_seqs([open(fwd_fpath)], 'fastq'),
                         read_seqs([open(rev_fpath)], 'fastq'))
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with reads with no direction
    fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
    rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
    output, orp = _match(read_seqs([open(fwd_fpath)], 'fastq'),
                         read_seqs([open(rev_fpath)], 'fastq'))
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    # NOTE(review): the same header is checked twice; one of the two was
    # possibly meant to be the 1:Y:18 read -- confirm against the data
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
    assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp
def test_mate_pair_unorderer_checker():
    'It checks match_pairs with ordered=False on concatenated files'

    def _match_unordered(fwd_fname, rev_fname):
        'It joins both files, matches the reads and returns both outputs'
        fwd_fpath = os.path.join(TEST_DATA_DIR, fwd_fname)
        rev_fpath = os.path.join(TEST_DATA_DIR, rev_fname)
        joined_fhand = NamedTemporaryFile()
        joined_fhand.write(open(fwd_fpath).read())
        joined_fhand.write(open(rev_fpath).read())
        joined_fhand.flush()
        seqs = read_seqs([joined_fhand])

        paired_fhand = StringIO()
        orphan_fhand = StringIO()
        match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq', ordered=False)
        return paired_fhand.getvalue(), orphan_fhand.getvalue()

    # with equal seqs but the last ones
    output, orp = _match_unordered('pairend1.sfastq', 'pairend2.sfastq')
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with the firsts seqs different
    output, orp = _match_unordered('pairend1.sfastq', 'pairend3.sfastq')
    assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
    assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    output, orp = _match_unordered('pairend4.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # unordered file
    output, orp = _match_unordered('pairend1.sfastq',
                                   'pairend2_unordered.sfastq')
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with reads with no direction
    output, orp = _match_unordered('pairend7.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    # NOTE(review): the same header is checked twice; one of the two was
    # possibly meant to be the 1:Y:18 read -- confirm against the data
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
    assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp
def _get_seq_lengths(fhand):
    'It returns a dict with the length of every sequence indexed by name'
    lengths = {}
    for seq in read_seqs([fhand]):
        # a repeated name keeps the length of its last sequence
        lengths[get_name(seq)] = get_length(seq)
    return lengths
def test_pair_matcher(self):
    """Test that match_pairs splits reads into paired and orphan outputs.

    Each file based scenario interleaves a forward and a reverse fastq
    file with flat_zip_longest, runs match_pairs on the result and checks
    which read titles were written to the paired output and which to the
    orphan output. The last scenario feeds an in-memory fastq in which the
    mate of the first read comes after an unrelated read and expects
    match_pairs to raise MalformedFile.
    """

    def match_files(fwd_fname, rev_fname):
        # Run match_pairs on two test data files and return the text
        # written to the paired output and to the orphan output.
        fwd_path = os.path.join(TEST_DATA_DIR, fwd_fname)
        rev_path = os.path.join(TEST_DATA_DIR, rev_fname)
        paired_fhand = StringIO()
        orphan_fhand = StringIO()
        # The inputs stay open until match_pairs has returned, in case
        # read_seqs reads them lazily; the with block then closes them.
        with open(fwd_path) as fwd_fhand, open(rev_path) as rev_fhand:
            fwd_seqs = read_seqs([fwd_fhand], file_format="fastq")
            rev_seqs = read_seqs([rev_fhand], file_format="fastq")
            seqs = flat_zip_longest(fwd_seqs, rev_seqs)
            match_pairs(seqs, paired_fhand, orphan_fhand, "fastq")
        return paired_fhand.getvalue(), orphan_fhand.getvalue()

    # with equal seqs but the last ones
    output, orp = match_files("pairend1.sfastq", "pairend2.sfastq")
    assert "@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in output
    assert "@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in output
    assert "@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orp

    # with the first seqs different
    output, orp = match_files("pairend1.sfastq", "pairend3.sfastq")
    assert "@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in output
    assert "@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in output
    assert "@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in orp
    assert "@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orp
    assert "@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orp

    output, orp = match_files("pairend4.sfastq", "pairend2.sfastq")
    assert "@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in output
    assert "@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in output
    assert "@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orp
    assert "@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in orp

    # with reads with no direction
    output, orp = match_files("pairend7.sfastq", "pairend2.sfastq")
    assert "@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG" in output
    assert "@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in output
    # NOTE(review): this assertion used to be written twice with the very
    # same string; the forward read ("1:Y:18") of seq1 was possibly meant
    # for one of them -- confirm against pairend7.sfastq.
    assert "@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG" in output
    assert "@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1" in orp
    assert "@seq7:136:FC706VJ:2:2104:15343:197393.hhhh" in orp
    assert "@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC" in orp

    # File is not sorted: s1.r is separated from s1.f by s2.f
    unsorted_fastq = (
        "@s1.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n"
        "@s2.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n"
        "@s1.r\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n"
    )
    seqs = read_seqs([StringIO(unsorted_fastq)], file_format="fastq")
    out_fhand = StringIO()
    orphan_out_fhand = StringIO()
    try:
        match_pairs(seqs, out_fhand, orphan_out_fhand, "fastq")
    except MalformedFile:
        pass
    else:
        self.fail("MalformedFile error expected")
def test_pair_matcher(self):
    """Test that match_pairs splits reads into paired and orphan outputs.

    Each file based scenario interleaves a forward and a reverse fastq
    file with flat_zip_longest, runs match_pairs on the result and checks
    which read titles were written to the paired output and which to the
    orphan output. The last scenario feeds an in-memory fastq in which the
    mate of the first read comes after an unrelated read and expects
    match_pairs, called with check_order_buffer_size=10, to raise
    ItemsNotSortedError.
    """

    def match_files(fwd_fname, rev_fname):
        # Run match_pairs on two test data files and return the text
        # written to the paired output and to the orphan output.
        fwd_path = os.path.join(TEST_DATA_DIR, fwd_fname)
        rev_path = os.path.join(TEST_DATA_DIR, rev_fname)
        paired_fhand = StringIO()
        orphan_fhand = StringIO()
        # The inputs stay open until match_pairs has returned, in case
        # read_seqs reads them lazily; the with block then closes them.
        with open(fwd_path) as fwd_fhand, open(rev_path) as rev_fhand:
            fwd_seqs = read_seqs([fwd_fhand])
            rev_seqs = read_seqs([rev_fhand])
            seqs = flat_zip_longest(fwd_seqs, rev_seqs)
            match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq')
        return paired_fhand.getvalue(), orphan_fhand.getvalue()

    # with equal seqs but the last ones
    output, orp = match_files('pairend1.sfastq', 'pairend2.sfastq')
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with the first seqs different
    output, orp = match_files('pairend1.sfastq', 'pairend3.sfastq')
    assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
    assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    output, orp = match_files('pairend4.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with reads with no direction
    output, orp = match_files('pairend7.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    # NOTE(review): this assertion used to be written twice with the very
    # same string; the forward read ('1:Y:18') of seq1 was possibly meant
    # for one of them -- confirm against pairend7.sfastq.
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
    assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

    # File is not sorted: s1.r is separated from s1.f by s2.f
    unsorted_fastq = (
        '@s1.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
        '@s2.f\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
        '@s1.r\nAACCAGTCAAC\n+\nCCCFFFFFGHH\n'
    )
    unsorted_fhand = StringIO(unsorted_fastq)
    # The format is set explicitly on the in-memory handle before reading.
    set_format(unsorted_fhand, 'fastq')
    seqs = read_seqs([unsorted_fhand])
    out_fhand = StringIO()
    orphan_out_fhand = StringIO()
    try:
        match_pairs(seqs, out_fhand, orphan_out_fhand, 'fastq',
                    check_order_buffer_size=10)
    except ItemsNotSortedError:
        pass
    else:
        self.fail('ItemsNotSortedError error expected')
def test_bin_transcrip_orientator(self):
    """Test the orientate_transcripts binary.

    The binary is run on seqs_to_orientate.fasta with several option sets:
    estscan matrix plus two blast databases, no annotator options, estscan
    matrix only, and the full option set with '-p 2'. After each run the
    output sequences are compared with the input ones, either unchanged or
    reverse complemented. A run that gives two '-d' databases but a single
    '-v' value is expected to exit with an error explaining that the blast
    parameters are not well defined.
    """
    orientate_bin = os.path.join(BIN_DIR, 'orientate_transcripts')
    assert 'usage' in check_output([orientate_bin, '-h'])

    in_fpath = os.path.join(TEST_DATA_DIR, 'seqs_to_orientate.fasta')
    estscan_matrix = os.path.join(TEST_DATA_DIR,
                                  'Arabidopsis_thaliana.smat')
    blastdb1 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
    blastdb2 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'calabaza')

    def read_records(fpath):
        # Load all the sequences of a file and close it afterwards.
        with open(fpath) as fhand:
            return list(read_seqs([fhand],
                                  prefered_seq_classes=[SEQRECORD]))

    # The input file is not modified by the runs, so it is read only once.
    init_seqs = read_records(in_fpath)

    def as_str(seq):
        return str(seq.object.seq)

    def revcomp_str(seq):
        return str(seq.object.seq.reverse_complement())

    def check_orientation(out_seqs, kept, flipped):
        # kept: indexes whose output sequence equals the input one.
        # flipped: indexes whose output is the input reverse complemented.
        for idx in kept:
            assert as_str(init_seqs[idx]) == as_str(out_seqs[idx])
        for idx in flipped:
            assert as_str(init_seqs[idx]) == revcomp_str(out_seqs[idx])

    def check_descriptions(out_seqs):
        # The description of each flipped sequence names what oriented it.
        assert 'polyA' in out_seqs[1].object.description
        assert 'estscan_orf' in out_seqs[4].object.description
        assert 'blast arabidopsis_genes' in out_seqs[6].object.description

    annotator_args = ['-u', estscan_matrix, '-d', blastdb1, '-d', blastdb2,
                      '-g', 'blastn', '-g', 'blastn',
                      '-v', '0.0001', '-v', '0.0001']

    # estscan matrix and both blast databases
    with NamedTemporaryFile() as out_fhand:
        cmd = [orientate_bin] + annotator_args
        cmd += [in_fpath, '-o', out_fhand.name, '--polya_min_len', '4']
        check_output(cmd)
        out_seqs = read_records(out_fhand.name)
    assert get_str_seq(init_seqs[0]) == get_str_seq(out_seqs[0])
    check_orientation(out_seqs, kept=(3, 5), flipped=(1, 4, 6))
    check_descriptions(out_seqs)

    # Two '-d' databases but only one '-v' value: the binary has to fail.
    cmd = [orientate_bin] + annotator_args[:-2] + [in_fpath]
    with NamedTemporaryFile() as stderr:
        try:
            check_output(cmd, stderr=stderr)
        except CalledProcessError:
            with open(stderr.name) as stderr_fhand:
                stde = stderr_fhand.read()
            assert 'Blast parameters are not well defined' in stde
        else:
            self.fail()

    with NamedTemporaryFile() as out_fhand:
        # without parameters
        check_output([orientate_bin, in_fpath, '-o', out_fhand.name,
                      '--polya_min_len', '4'])
        out_seqs = read_records(out_fhand.name)
        check_orientation(out_seqs, kept=(0, 3, 4, 5, 6), flipped=(1,))

        # only with orf annotator, written to the same output file
        check_output([orientate_bin, in_fpath, '-o', out_fhand.name,
                      '-u', estscan_matrix, '--polya_min_len', '4'])
        out_seqs = read_records(out_fhand.name)
        check_orientation(out_seqs, kept=(0, 3, 5, 6), flipped=(1, 4))

    # multiprocessor
    with NamedTemporaryFile() as out_fhand:
        cmd = [orientate_bin] + annotator_args
        cmd += [in_fpath, '-o', out_fhand.name, '-p', '2',
                '--polya_min_len', '4']
        check_output(cmd)
        out_seqs = read_records(out_fhand.name)
    check_orientation(out_seqs, kept=(0, 3, 5), flipped=(1, 4, 6))
    check_descriptions(out_seqs)