Beispiel #1
0
def test_open_file_xz(tmpdir):
    """Round-trip a bytes payload through ``open_file`` on a ``.xz`` path."""
    payload = b'test\n'
    xz_path = tmpdir.join('test-open.xz').strpath

    # Close the writer before the file is read back
    writer = open_file(xz_path, mode='w')
    writer.write(payload)
    writer.close()

    read_back = open_file(xz_path, mode='r').read()
    assert read_back == payload
Beispiel #2
0
def test_open_file_text(tmpdir):
    """Round-trip ASCII text through ``open_file`` using binary modes."""
    expected_text = 'test\n'
    plain_path = tmpdir.join('test-open').strpath

    # The text is encoded by hand because the file is opened in binary mode
    writer = open_file(plain_path, mode='wb')
    writer.write(expected_text.encode('ascii'))
    writer.close()

    raw_bytes = open_file(plain_path, mode='rb').read()
    assert raw_bytes.decode('ascii') == expected_text
Beispiel #3
0
def test_write_fasta_sequence1(nucseq, tmpdir):
    """Write the first record of *nucseq* and check it reads back unchanged."""
    original_id, original_seq = next(fasta.load_fasta(nucseq))

    out_path = (tmpdir / 'test.fa').strpath
    out_handle = open_file(out_path, 'w')
    fasta.write_fasta_sequence(out_handle, original_id, original_seq)
    out_handle.close()

    # Only the first record of the written file is needed for the comparison
    reloaded_id, reloaded_seq = next(fasta.load_fasta(out_path))

    assert (original_id, original_seq) == (reloaded_id, reloaded_seq)
Beispiel #4
0
def test_write_fasta_sequence2(nucseq, tmpdir):
    """Write every record of *nucseq* and check the record count matches."""
    out_path = (tmpdir / 'test.fa').strpath

    out_handle = open_file(out_path, 'w')
    for record_id, record_seq in fasta.load_fasta(nucseq):
        fasta.write_fasta_sequence(out_handle, record_id, record_seq)
    out_handle.close()

    # Count the records in the source and in the freshly written copy
    expected_total = sum(1 for _ in fasta.load_fasta(nucseq))
    written_total = sum(1 for _ in fasta.load_fasta(out_path))

    assert expected_total == written_total
Beispiel #5
0
def read_samtools_depth(file_handle, num_seqs=10000, seq_ids=None):
    """
    Reads a samtools *depth* file, returning a generator that yields the
    positions and coverages of each base on a per-sequence base.

    .. versionchanged:: 0.4.2
        the function returns **lists** instead of numpy arrays for speed (at
        least in my tests it seems ~4x increase)

    .. versionchanged:: 0.4.0
        now returns 3 array, instead of 2. Also added *seq_ids* to skip lines

    .. versionchanged:: 0.3.4
        *num_seqs* can be None to avoid a log message

    .. versionadded:: 0.3.0

    .. note::

        There's no need anymore to use `samtools depth -aa`, because the
        function returns the position array and this can be used to create a
        Pandas SparseArray which can be reindexed to include missing positions
        (with values of 0)

        **Valid for version < 0.4.0**:

        The information on position is not used, to use numpy and save memory.
        samtools *depth* should be called with the `-aa` option::

             `samtools depth -aa bamfile`

        This option will output both base positions with 0 coverage and
        sequences with no aligned reads

    .. note::

        Lines are expected to be tab separated with exactly 3 fields
        (sequence, position, coverage) and lines of the same sequence are
        expected to be contiguous. If no line is read (empty file or no
        sequence found in *seq_ids*), a single tuple with an empty identifier
        and two empty lists is still yielded.

    Arguments:
        file_handle (file): file handle of the coverage file, it is passed to
            `open_file` with mode *rb*
        num_seqs (int or None): number of sequence that fires a log message. If
            None, no message is triggered
        seq_ids (dict, set): a hashed container like a dictionary or set with
            the sequences to return. If None, all sequences are returned

    Yields:
        tuple: the first element is the sequence identifier, the second one
        is the list with the positions, the third element is the list with the
        coverages
    """
    curr_key = ''
    curr_pos = []
    curr_cov = []

    file_handle = open_file(file_handle, 'rb')

    LOG.info(
        'Reading coverage from file (%s)',
        getattr(file_handle, 'name', repr(file_handle))
    )
    line_no = 0
    for line in file_handle:
        # The line terminator is deliberately left attached to the last field:
        # int() ignores surrounding whitespace, so no stripping is needed and
        # a last line without a trailing newline doesn't lose its last digit
        name, pos, cov = line.decode('ascii').split('\t')
        if (seq_ids is not None) and (name not in seq_ids):
            continue
        # only converts if sequence is to be used
        pos = int(pos)
        cov = int(cov)

        if curr_key == name:
            # same sequence as the previous line
            curr_pos.append(pos)
            curr_cov.append(cov)
        elif curr_key == '':
            # first line that is kept, nothing to yield yet
            curr_pos.append(pos)
            curr_cov.append(cov)
            curr_key = name
        else:
            # a new sequence starts: the previous one is complete
            line_no += 1
            if (num_seqs is not None) and (line_no % num_seqs == 0):
                LOG.info('Read %d sequence coverage', line_no)
            yield curr_key, curr_pos, curr_cov
            # new lists are created, so the ones just yielded are not
            # modified while reading the next sequence
            curr_key = name
            curr_pos = [pos]
            curr_cov = [cov]

    # the last sequence is only complete when the file ends
    yield curr_key, curr_pos, curr_cov

    LOG.info('Read a total of %d sequence coverage', line_no + 1)