Example #1
    def test_set_sff_trimpoints(self):
        _, orig_reads = parse_binary_sff(open(self.sff_fp), True)
        orig_reads = list(orig_reads)

        set_sff_trimpoints(self.sff_dir, {'F6AVWTA01': 10})

        _, reads = parse_binary_sff(open(self.sff_fp), True)
        for read, orig_read in zip(reads, orig_reads):
            self.assertEqual(read['clip_qual_left'], 11)
            # Check that everything else is the same between original
            # reads and trimmed reads.
            orig_read['clip_qual_left'] = 11
            self.assertEqual(read, orig_read)
Example #2
    def test_set_sff_trimpoints(self):
        _, orig_reads = parse_binary_sff(open(self.sff_fp), True)
        orig_reads = list(orig_reads)

        set_sff_trimpoints(self.sff_dir, {'F6AVWTA01': 10})

        _, reads = parse_binary_sff(open(self.sff_fp), True)
        for read, orig_read in zip(reads, orig_reads):
            self.assertEqual(read['clip_qual_left'], 11)
            # Check that everything else is the same between original
            # reads and trimmed reads.
            orig_read['clip_qual_left'] = 11
            self.assertEqual(read, orig_read)
Example #3
    def test_adjust_sff_cycles(self):
        sff_data = parse_binary_sff(open(self.sff_fp))
        sff_gz_data = parse_binary_sff(qiime_open(self.sff_gz_fp))
        header, reads = adjust_sff_cycles(sff_data, 2)
        header_gz, reads_gz = adjust_sff_cycles(sff_gz_data, 2)
        expected_header = {
            'header_length': 48,
            'version': 1,
            'index_length': 0,
            'magic_number': 779314790,
            'number_of_flows_per_read': 8,
            'flowgram_format_code': 1,
            'flow_chars': 'TACGTACG',
            'index_offset': 0,
            'key_sequence': 'TCAG',
            'number_of_reads': 1,
            'key_length': 4,
        }
        self.assertEqual(header, expected_header)
        self.assertEqual(header_gz, expected_header)

        expected_read = {
            'name_length': 14,
            'Name': 'FA6P1OK01CGMHQ',
            'flowgram_values':
                [1.04, 0.0, 1.01, 0.0, 0.0, 0.95999999999999996, 0.0, 1.02],
            'clip_adapter_left': 0,
            'read_header_length': 32,
            'Bases': 'TCAG',
            'number_of_bases': 4,
            'flow_index_per_base': (1, 2, 3, 2),
            'clip_qual_left': 4,
            'clip_adapter_right': 0,
            'clip_qual_right': 4,
            'quality_scores': (32, 32, 32, 32),
        }
        reads = list(reads)
        reads_gz = list(reads_gz)
        self.assertEqual(len(reads), 1)
        self.assertEqual(len(reads_gz), 1)
        self.assertEqual(reads[0], expected_read)
        self.assertEqual(reads_gz[0], expected_read)
Example #4
    def test_adjust_sff_cycles(self):
        sff_data = parse_binary_sff(open(self.sff_fp))
        header, reads = adjust_sff_cycles(sff_data, 2)
        expected_header = {
            'header_length': 48,
            'version': 1,
            'index_length': 0,
            'magic_number': 779314790,
            'number_of_flows_per_read': 8,
            'flowgram_format_code': 1,
            'flow_chars': 'TACGTACG',
            'index_offset': 0,
            'key_sequence': 'TCAG',
            'number_of_reads': 1,
            'key_length': 4,
            }
        self.assertEqual(header, expected_header)

        expected_read = {
            'name_length': 14,
            'Name': 'FA6P1OK01CGMHQ',
            'flowgram_values': [1.04, 0.0, 1.01, 0.0, 0.0, 0.95999999999999996, 0.0, 1.02],
            'clip_adapter_left': 0,
            'read_header_length': 32,
            'Bases': 'TCAG',
            'number_of_bases': 4,
            'flow_index_per_base': (1, 2, 3, 2),
            'clip_qual_left': 4,
            'clip_adapter_right': 0,
            'clip_qual_right': 4,
            'quality_scores': (32, 32, 32, 32),
            }
        reads = list(reads)
        self.assertEqual(len(reads), 1)
        self.assertEqual(reads[0], expected_read)
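
The two tests above exercise adjust_sff_cycles, which takes the (header, reads) pair produced by parse_binary_sff plus a number of flow cycles and returns a truncated header and read stream (2 cycles above yields 8 flows of 'TACGTACG'). A minimal standalone sketch of the same call chain follows; the input and output paths are hypothetical and the qiime.process_sff import path is an assumption based on QIIME's layout.

# Sketch only: compose parse_binary_sff -> adjust_sff_cycles -> write_binary_sff.
# 'titanium.sff' and 'flx_length.sff' are hypothetical paths; the module path
# for adjust_sff_cycles is an assumption.
from cogent.parse.binary_sff import parse_binary_sff, write_binary_sff
from qiime.process_sff import adjust_sff_cycles

# native_flowgram_values=True keeps flowgram values in their on-disk form,
# so the reads can be written back out unchanged.
sff_data = parse_binary_sff(open('titanium.sff'), True)
header, reads = adjust_sff_cycles(sff_data, 100)  # keep 100 flow cycles
write_binary_sff(open('flx_length.sff', 'w'), header, reads)
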
Example #5
    def test_parse_sff(self):
        header, reads = parse_binary_sff(self.sff_file)
        self.assertEqual(header, COMMON_HEADER)
        counter = 0
        for read in reads:
            self.assertEqual(
                len(read['flowgram_values']), header['number_of_flows_per_read'])
            counter += 1
        self.assertEqual(counter, 20)
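
The test above shows the core parse_binary_sff contract: a header dict plus a lazy generator of per-read dicts. A minimal sketch of the same pattern outside a test, using a hypothetical input path (the import mirrors Example #31 at the end of this page):

# Sketch only: basic parse_binary_sff usage; 'reads.sff' is a hypothetical path.
from cogent.parse.binary_sff import parse_binary_sff

sff_file = open('reads.sff')
header, reads = parse_binary_sff(sff_file)

# The header is a plain dict (see the expected_header dicts above).
print header['number_of_reads'], header['number_of_flows_per_read']

# reads is a generator of per-read dicts and can only be walked once;
# wrap it in list() if it needs to be reused.
for read in reads:
    print read['Name'], read['number_of_bases'], read['Bases']

sff_file.close()
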
Example #6
    def _check_unmodified_sff_contents(self, sff_file):
        """Extracting repeated code from sfffile tests"""
        sff_file.seek(0)
        header, reads_gen = parse_binary_sff(sff_file)
        reads = list(reads_gen)

        self.assertEqual(header["number_of_reads"], 1)
        self.assertEqual(len(reads), 1)
        self.assertEqual(reads[0]['Name'], 'FA6P1OK01CGMHQ')
Example #7
    def test_parse_sff(self):
        header, reads = parse_binary_sff(self.sff_file)
        self.assertEqual(header, COMMON_HEADER)
        counter = 0
        for read in reads:
            self.assertEqual(len(read['flowgram_values']),
                             header['number_of_flows_per_read'])
            counter += 1
        self.assertEqual(counter, 20)
Example #8
    def test_combine_sff_data(self):
        sff_datasets = [parse_binary_sff(open(fp)) for fp in self.sff_fps]
        observed_header, observed_reads = combine_sff_data(*sff_datasets)
        self.assertEqual(observed_header, combined_header)

        observed_reads = list(observed_reads)
        self.assertEqual(len(observed_reads), 40)
        observed_ids = [r['Name'] for r in observed_reads]
        self.assertEqual(observed_ids, combined_ids)
Example #9
    def test_combine_sff_data(self):
        sff_datasets = [parse_binary_sff(open(fp)) for fp in self.sff_fps]
        observed_header, observed_reads = combine_sff_data(*sff_datasets)
        self.assertEqual(observed_header, combined_header)

        observed_reads = list(observed_reads)
        self.assertEqual(len(observed_reads), 40)
        observed_ids = [r['Name'] for r in observed_reads]
        self.assertEqual(observed_ids, combined_ids)
Example #10
    def _check_unmodified_sff_contents(self, sff_file):
        """Extracting repeated code from sfffile tests"""
        sff_file.seek(0)
        header, reads_gen = parse_binary_sff(sff_file)
        reads = list(reads_gen)

        self.assertEqual(header["number_of_reads"], 1)
        self.assertEqual(len(reads), 1)
        self.assertEqual(reads[0]['Name'], 'FA6P1OK01CGMHQ')
Example #11
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads."""
    if use_sfftools:
        check_sfffile()
        _check_call(
            ['sfffile', '-flx', '-o', output_fp, sff_fp],
            stdout=open(os.devnull, 'w'))
    else:
        header, reads = adjust_sff_cycles(parse_binary_sff(open(sff_fp), True), 100)
        write_binary_sff(open(output_fp, 'w'), header, reads)
Example #12
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads."""
    if use_sfftools:
        check_sfffile()
        _check_call(['sfffile', '-flx', '-o', output_fp, sff_fp],
                    stdout=open(os.devnull, 'w'))
    else:
        header, reads = adjust_sff_cycles(parse_binary_sff(open(sff_fp), True),
                                          100)
        write_binary_sff(open(output_fp, 'w'), header, reads)
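
A hedged usage sketch for convert_Ti_to_FLX as defined above; the file paths are hypothetical and the qiime.process_sff import path is an assumption based on QIIME's layout.

# Sketch only: paths are hypothetical; module path is an assumption.
from qiime.process_sff import convert_Ti_to_FLX

# Pure-Python path: trims a Titanium SFF to 100 flow cycles (FLX length).
convert_Ti_to_FLX('titanium_run.sff', 'flx_length_run.sff')

# Or shell out to the external sfffile binary, if it is installed.
convert_Ti_to_FLX('titanium_run.sff', 'flx_length_run.sff', use_sfftools=True)
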
Example #13
    def test_set_clip_qual_left(self):
        orig_header, orig_reads = parse_binary_sff(open(self.sff_fp), True)
        orig_reads = list(orig_reads)

        _, clip_reads = set_clip_qual_left((orig_header, orig_reads), 8)

        for read, orig_read in zip(clip_reads, orig_reads):
            self.assertEqual(read["clip_qual_left"], 9)
            # Check that everything else is the same between original
            # reads and trimmed reads.
            orig_read["clip_qual_left"] = 9
            self.assertEqual(read, orig_read)
Example #14
def make_per_library_sff(sff_fps, id_list_fp, debug=False):
    id_list_basepath, _ = os.path.splitext(id_list_fp)
    output_fp = id_list_basepath + '.sff'

    sff_datasets = [parse_binary_sff(open(fp), True) for fp in sff_fps]
    sff_data = combine_sff_data(*sff_datasets)
    ids = parse_id_list(open(id_list_fp))

    filtered_sff_data = filter_sff_reads(sff_data, ids_to_keep=ids)
    if debug:
        print 'Creating SFF file for %s' % id_list_fp
    write_binary_sff(open(output_fp, 'w'), *filtered_sff_data)
Example #15
    def test_set_clip_qual_left(self):
        orig_header, orig_reads = parse_binary_sff(open(self.sff_fp), True)
        orig_reads = list(orig_reads)

        _, clip_reads = set_clip_qual_left((orig_header, orig_reads), 8)

        for read, orig_read in zip(clip_reads, orig_reads):
            self.assertEqual(read['clip_qual_left'], 9)
            # Check that everything else is the same between original
            # reads and trimmed reads.
            orig_read['clip_qual_left'] = 9
            self.assertEqual(read, orig_read)
Example #16
def make_per_library_sff(sff_fps, id_list_fp, debug=False):
    id_list_basepath, _ = os.path.splitext(id_list_fp)
    output_fp = id_list_basepath + '.sff'

    sff_datasets = [parse_binary_sff(open(fp), True) for fp in sff_fps]
    sff_data = combine_sff_data(*sff_datasets)
    ids = parse_id_list(open(id_list_fp))

    filtered_sff_data = filter_sff_reads(sff_data, ids_to_keep=ids)
    if debug:
        print 'Creating SFF file for %s' % id_list_fp
    write_binary_sff(open(output_fp, 'w'), *filtered_sff_data)
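
make_per_library_sff above reads one or more SFF files, keeps only the reads named in an ID list, and writes the result next to the ID list (swapping its extension for '.sff'). A hedged usage sketch with hypothetical paths; the qiime.process_sff import path is an assumption.

# Sketch only: paths are hypothetical; module path is an assumption.
from qiime.process_sff import make_per_library_sff

sff_fps = ['run1.sff', 'run2.sff']   # input SFF files to combine
id_list_fp = 'libraryA_ids.txt'      # one read name per line (see tests below)

# Writes the matching reads to 'libraryA_ids.sff', derived from id_list_fp.
make_per_library_sff(sff_fps, id_list_fp)
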
Example #17
def format_binary_sff_as_fna(sff_file, output_file=None, qual=False):
    """Write a binary SFF file to an output file, in FASTA format.

    If no output file is provided, an in-memory file-like buffer is
    used (namely, a StringIO object).
    """
    # TODO: Move to PyCogent
    if output_file is None:
        output_file = StringIO()
    _, reads = parse_binary_sff(sff_file)
    for read in reads:
        output_file.write(format_read_as_fna(read, qual))
    return output_file
Example #18
def format_binary_sff_as_fna(sff_file, output_file=None, qual=False):
    """Write a binary SFF file to an output file, in FASTA format.

    If no output file is provided, an in-memory file-like buffer is
    used (namely, a StringIO object).
    """
    # TODO: Move to PyCogent
    if output_file is None:
        output_file = StringIO()
    _, reads = parse_binary_sff(sff_file)
    for read in reads:
        output_file.write(format_read_as_fna(read, qual))
    return output_file
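
format_binary_sff_as_fna above defaults to collecting its output in an in-memory StringIO buffer when no output file is given. A small sketch of both call styles, assuming the function as defined above is in scope (its QIIME module path is not shown here) and using a hypothetical input path.

# Sketch only: assumes format_binary_sff_as_fna (defined above) is in scope;
# 'reads.sff' is a hypothetical input path.
sff_file = open('reads.sff')

# Default: FASTA-formatted reads accumulate in a StringIO buffer.
buf = format_binary_sff_as_fna(sff_file)
print buf.getvalue()

# Or pass an explicit file object; the qual flag is simply forwarded
# to format_read_as_fna for each read.
sff_file.seek(0)
out = open('reads.qual', 'w')
format_binary_sff_as_fna(sff_file, output_file=out, qual=True)
out.close()
sff_file.close()
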
Example #19
    def test_set_sff_trimpoints_with_sfftools(self):
        _, orig_reads = parse_binary_sff(open(self.sff_fp), True)
        orig_reads = list(orig_reads)

        set_sff_trimpoints_with_sfftools(self.sff_dir, {'F6AVWTA01': 10})

        # check trimpoint file
        for line in open(self.sff_fp + '.trim'):
            toks = line.split()
            trim_start = int(toks[1])
            trim_end = int(toks[2])
            self.assertTrue(trim_start <= trim_end)
            self.assertEqual(trim_start, 11)

        # Check resultant SFF file
        _, reads = parse_binary_sff(open(self.sff_fp), True)
        for read, orig_read in zip(reads, orig_reads):
            self.assertEqual(read['clip_qual_left'], 11)
            # Check that everything else is the same between original
            # reads and trimmed reads.
            orig_read['clip_qual_left'] = 11
            self.assertEqual(read, orig_read)
Example #20
    def test_set_sff_trimpoints_with_sfftools(self):
        _, orig_reads = parse_binary_sff(open(self.sff_fp), True)
        orig_reads = list(orig_reads)

        set_sff_trimpoints_with_sfftools(self.sff_dir, {'F6AVWTA01': 10})

        # check trimpoint file
        for line in open(self.sff_fp + '.trim'):
            toks = line.split()
            trim_start = int(toks[1])
            trim_end = int(toks[2])
            self.assertTrue(trim_start <= trim_end)
            self.assertEqual(trim_start, 11)

        # Check resultant SFF file
        _, reads = parse_binary_sff(open(self.sff_fp), True)
        for read, orig_read in zip(reads, orig_reads):
            self.assertEqual(read['clip_qual_left'], 11)
            # Check that everything else is the same between original
            # reads and trimmed reads.
            orig_read['clip_qual_left'] = 11
            self.assertEqual(read, orig_read)
Example #21
    def test_make_per_library_sff(self):
        id_list_file = tempfile.NamedTemporaryFile()
        id_list_file.write('GA202I001ER3QL\nGA202I001DBRNC\nGA202I001DJLC5\n')
        id_list_file.seek(0)

        make_per_library_sff(self.sff_fps, id_list_file.name)

        header, reads = parse_binary_sff(open(id_list_file.name + '.sff'))
        self.assertEquals(header, per_library_header)

        self.assertEqual(reads.next()['Name'], 'GA202I001ER3QL')
        self.assertEqual(reads.next()['Name'], 'GA202I001DBRNC')
        self.assertEqual(reads.next()['Name'], 'GA202I001DJLC5')
        self.assertRaises(StopIteration, reads.next)
Example #22
    def test_make_per_library_sff(self):
        id_list_file = tempfile.NamedTemporaryFile()
        id_list_file.write('GA202I001ER3QL\nGA202I001DBRNC\nGA202I001DJLC5\n')
        id_list_file.seek(0)

        make_per_library_sff(self.sff_fps, id_list_file.name)

        header, reads = parse_binary_sff(open(id_list_file.name + '.sff'))
        self.assertEquals(header, per_library_header)

        self.assertEqual(reads.next()['Name'], 'GA202I001ER3QL')
        self.assertEqual(reads.next()['Name'], 'GA202I001DBRNC')
        self.assertEqual(reads.next()['Name'], 'GA202I001DJLC5')
        self.assertRaises(StopIteration, reads.next)
Example #23
    def test_make_per_library_sff_with_sfffile(self):
        id_list_file = tempfile.NamedTemporaryFile()
        id_list_file.write('GA202I001ER3QL\nGA202I001DBRNC\nGA202I001DJLC5\n')
        id_list_file.seek(0)

        make_per_library_sff_with_sfffile(self.sff_fps, id_list_file.name)

        header, reads = parse_binary_sff(open(id_list_file.name + '.sff'))
        # The index length varies between versions of sfftools
        del header['index_length']
        self.assertEquals(header, per_library_header_sfffile)

        self.assertEqual(reads.next()['Name'], 'GA202I001ER3QL')
        self.assertEqual(reads.next()['Name'], 'GA202I001DBRNC')
        self.assertEqual(reads.next()['Name'], 'GA202I001DJLC5')
        self.assertRaises(StopIteration, reads.next)
Example #24
    def test_make_per_library_sff_with_sfffile(self):
        id_list_file = tempfile.NamedTemporaryFile()
        id_list_file.write('GA202I001ER3QL\nGA202I001DBRNC\nGA202I001DJLC5\n')
        id_list_file.seek(0)

        make_per_library_sff_with_sfffile(self.sff_fps, id_list_file.name)

        header, reads = parse_binary_sff(open(id_list_file.name + '.sff'))
        # The index length varies between versions of sfftools
        del header['index_length']
        self.assertEquals(header, per_library_header_sfffile)

        self.assertEqual(reads.next()['Name'], 'GA202I001ER3QL')
        self.assertEqual(reads.next()['Name'], 'GA202I001DBRNC')
        self.assertEqual(reads.next()['Name'], 'GA202I001DJLC5')
        self.assertRaises(StopIteration, reads.next)
Example #25
    def test_call_with_excluded_accession_numbers(self):
        """Sfffile should exclude specified accession numbers in output."""
        accno_file = tempfile.NamedTemporaryFile()
        accno_file.write('FA6P1OK01CGMHQ\n')
        accno_file.seek(0)

        a = Sfffile()
        a.Parameters['-e'].on(accno_file.name)
        app_results = a(self.sff_fp)

        header, reads_gen = parse_binary_sff(app_results['sff'])
        reads = list(reads_gen)

        self.assertEqual(header["number_of_reads"], 0)
        self.assertEqual(len(reads), 0)
        app_results.cleanUp()
Example #26
def set_sff_trimpoints(sff_dir, technical_lengths):
    """Set trimpoints to end of technical read for all SFF files in directory.
    """
    for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
        try:
            readlength = technical_lengths[lib_id]
        except KeyError:
            continue
        sff_data = parse_binary_sff(open(sff_fp), True)
        clipped_header, clipped_reads = set_clip_qual_left(sff_data, readlength)

        _, temp_fp = tempfile.mkstemp(dir=sff_dir)
        with open(temp_fp, "w") as f:
            write_binary_sff(f, clipped_header, clipped_reads)

        shutil.move(temp_fp, sff_fp)
Example #27
    def test_call_with_excluded_accession_numbers(self):
        """Sfffile should exclude specified accession numbers in output."""
        accno_file = tempfile.NamedTemporaryFile()
        accno_file.write('FA6P1OK01CGMHQ\n')
        accno_file.seek(0)

        a = Sfffile()
        a.Parameters['-e'].on(accno_file.name)
        app_results = a(self.sff_fp)

        header, reads_gen = parse_binary_sff(app_results['sff'])
        reads = list(reads_gen)

        self.assertEqual(header["number_of_reads"], 0)
        self.assertEqual(len(reads), 0)
        app_results.cleanUp()
Example #28
def set_sff_trimpoints(sff_dir, technical_lengths):
    """Set trimpoints to end of technical read for all SFF files in directory.
    """
    for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
        try:
            readlength = technical_lengths[lib_id]
        except KeyError:
            continue
        sff_data = parse_binary_sff(open(sff_fp), True)
        clipped_header, clipped_reads = set_clip_qual_left(sff_data, readlength)
        
        _, temp_fp = tempfile.mkstemp(dir=sff_dir)
        with open(temp_fp, 'w') as f:
            write_binary_sff(f, clipped_header, clipped_reads)

        shutil.move(temp_fp, sff_fp)
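
set_sff_trimpoints above walks each per-library SFF file in a directory and rewrites it in place with clip_qual_left moved past the library's technical read. A hedged usage sketch; the directory is hypothetical, the library ID and length are taken from the tests earlier on this page, and the qiime.process_sff import path is an assumption.

# Sketch only: directory is hypothetical; module path is an assumption.
from qiime.process_sff import set_sff_trimpoints

# Map each library ID to the length of its technical read (e.g. barcode + primer).
technical_lengths = {'F6AVWTA01': 10}

# Each matching SFF file is rewritten in place; the tests above expect
# clip_qual_left == readlength + 1 (here, 11) afterwards.
set_sff_trimpoints('per_library_sff_dir', technical_lengths)
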
Example #29
    def test_write_binary_sff(self):
        read = READ_HEADER.copy()
        read.update(READ_DATA)

        header = COMMON_HEADER.copy()
        header['number_of_reads'] = 1

        write_binary_sff(self.output_file, header, [read])

        file_pos = self.output_file.tell()
        self.assertTrue(file_pos % 8 == 0)

        self.output_file.seek(0)
        observed_header, observed_reads = parse_binary_sff(
            self.output_file, native_flowgram_values=True)
        observed_reads = list(observed_reads)
        self.assertEqual(observed_header, header)
        self.assertEqual(observed_reads[0], read)
        self.assertEqual(len(observed_reads), 1)

        file_pos = self.output_file.tell()
        self.assertTrue(file_pos % 8 == 0)
Example #30
    def test_write_binary_sff(self):
        read = READ_HEADER.copy()
        read.update(READ_DATA)

        header = COMMON_HEADER.copy()
        header['number_of_reads'] = 1

        write_binary_sff(self.output_file, header, [read])

        file_pos = self.output_file.tell()
        self.assertTrue(file_pos % 8 == 0)

        self.output_file.seek(0)
        observed_header, observed_reads = parse_binary_sff(
            self.output_file, native_flowgram_values=True)
        observed_reads = list(observed_reads)
        self.assertEqual(observed_header, header)
        self.assertEqual(observed_reads[0], read)
        self.assertEqual(len(observed_reads), 1)

        file_pos = self.output_file.tell()
        self.assertTrue(file_pos % 8 == 0)
Example #31
#fn = binary_sff('testdata/sff_reads_1050.sff')
#seqs = LoadSeqs(fn, moltype=DNA, aligned=False)
#print seqs
import qiime.split_libraries
from cogent import LoadSeqs, DNA
from cogent.parse.binary_sff import (
    seek_pad, parse_common_header, parse_read_header, parse_read_data,
    validate_common_header, parse_read, parse_binary_sff, UnsupportedSffError,
    write_pad, write_common_header, write_read_header, write_read_data,
    write_binary_sff,
    )
sff_in = open("testdata/sff_reads_1050.sff")
#sff_out = open("filtered.sff", "wb")
# parse_binary_sff returns the common header and a generator of reads;
# force evaluation into a list so the reads can be iterated more than once.
header, reads = parse_binary_sff(sff_in, native_flowgram_values=True)
reads = list(reads)
#aln = LoadSeqs(data=reads)
#header, reads = parse_read(sff_in, native_flowgram_values=True)
for read in reads:
    print read["Name"], read["Bases"]
# Keep only reads longer than 504 bases
reads = [r for r in reads if r["number_of_bases"] > 504]
# Adjust number of reads in SFF header
header['number_of_reads'] = len(reads)
# No index is written by write_binary_sff
header['index_offset'] = 0
header['index_length'] = 0
#write_binary_sff(sff_out, header, reads)
sff_in.close()
#sff_out.close()