Exemple #1
0
def get_str_qualities(seq, out_format=None):
    '''It returns the qualities of the sequence encoded as a string.

    seq is a wrapped sequence with the kind and file_format attributes.
    out_format is the file format whose quality encoding should be used for
    the returned string; by default the file format of seq is used.

    For a SeqItem already stored with the requested encoding the quality
    lines are returned as they are (stripped and joined), otherwise the
    integer qualities are re-encoded.

    It raises a ValueError if a SeqItem comes from a fasta file, if its
    quality format is not supported or if the kind of seq is unknown.
    '''
    if out_format is None:
        out_format = seq.file_format
    out_format = remove_multiline(out_format)
    # From here on out_format holds the quality encoding, not the file format
    if out_format in SANGER_FASTQ_FORMATS:
        out_format = SANGER_QUALITY
    elif out_format in ILLUMINA_FASTQ_FORMATS:
        out_format = ILLUMINA_QUALITY

    seq_class = seq.kind
    if seq_class == SEQITEM:
        in_format = remove_multiline(seq.file_format)
        if 'fasta' in in_format:
            raise ValueError('A fasta file has no qualities')
        if in_format in SANGER_FASTQ_FORMATS:
            in_format = SANGER_QUALITY
        elif in_format in ILLUMINA_FASTQ_FORMATS:
            in_format = ILLUMINA_QUALITY
        else:
            msg = 'Unknown or not supported quality format: '
            msg += in_format
            raise ValueError(msg)
        if in_format == out_format:
            # Same encoding, the stored quality text can be reused directly
            quals = ''.join(line.rstrip()
                            for line in _get_seqitem_qual_lines(seq))
        else:
            int_quals = get_int_qualities(seq)
            quals = ''.join(_int_quals_to_str_quals(int_quals, out_format))
    elif seq_class == SEQRECORD:
        int_quals = get_int_qualities(seq)
        quals = ''.join(_int_quals_to_str_quals(int_quals, out_format))
    else:
        # Without this branch the return below failed with an
        # UnboundLocalError that hid the real problem
        raise ValueError('Unknown class for seq: ' + str(seq_class))
    return quals
Exemple #2
0
def _write_filter_trim_packets(passed_fhand, diverted_fhand, packets,
                               file_format='fastq', workers=None,
                               seqs_diverted=SEQS_FILTERED_OUT):
    '''It writes the passed and the diverted seqs of a packet stream.

    When there is no diverted_fhand only the passed seqs are written and the
    result of write_seqs is returned. Otherwise every packet is written to
    both file handles and nothing is returned.

    If anything fails while writing, the workers (when given) are terminated
    before the error is raised again.
    '''
    def _unpair(pairs):
        'It flattens an iterable of pairs into a stream of seqs'
        return (seq for pair in pairs for seq in pair)

    def _stop_workers():
        'It terminates the workers, if there are any'
        if workers is not None:
            workers.terminate()

    out_format = remove_multiline(file_format)

    if diverted_fhand is None:
        passed_packets = (packet[SEQS_PASSED] for packet in packets)
        passed_seqs = _unpair(chain.from_iterable(passed_packets))
        try:
            return write_seqs(passed_seqs, passed_fhand,
                              file_format=out_format)
        except BaseException:
            _stop_workers()
            raise

    for packet in packets:
        try:
            write_seqs(_unpair(packet[SEQS_PASSED]), fhand=passed_fhand,
                       file_format=out_format)
            # Filtered out seqs are grouped in pairs, like the passed ones,
            # so they have to be flattened. Any other kind of diverted seqs
            # (e.g. orphans) is already a flat list of seqs.
            diverted = packet[seqs_diverted]
            if seqs_diverted == SEQS_FILTERED_OUT:
                diverted = _unpair(diverted)
            write_seqs(diverted, fhand=diverted_fhand, file_format=out_format)
        except BaseException:
            _stop_workers()
            raise
Exemple #3
0
def _read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator over the seqrecords of all the given files.

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise file_format is used for all of them. One reader per file
    is created up front and the readers are chained in the given order.
    '''
    # Readers that accept the title2ids hook. Any other format is handed over
    # to parse_into_seqrecs.
    readers_with_title = {'fasta': FastaIterator,
                          'qual': QualPhredIterator,
                          'fastq': FastqPhredIterator,
                          'fastq-sanger': FastqPhredIterator,
                          'fastq-solexa': FastqSolexaIterator,
                          'fastq-illumina': FastqIlluminaIterator}
    guess_it = file_format == GUESS_FORMAT or file_format is None

    readers = []
    for fhand in fhands:
        fmt = guess_format(fhand) if guess_it else file_format
        fmt = remove_multiline(fmt)

        reader_class = readers_with_title.get(fmt)
        if reader_class is None:
            reader = parse_into_seqrecs(fhand, fmt)
        else:
            reader = reader_class(fhand, title2ids=title2ids)
        readers.append(reader)
    return chain.from_iterable(readers)
Exemple #4
0
def write_filter_packets(passed_fhand, filtered_fhand, filter_packets,
                         file_format='fastq', workers=None):
    '''It writes the passed and the filtered out seqs of a filter stream.

    When there is no filtered_fhand only the passed seqs are written and the
    result of write_seqs is returned. Otherwise the passed and the filtered
    out seqs of every packet are written to their file handles.

    If anything fails while writing, the workers (when given) are terminated
    before the error is raised again.
    '''
    def _unpair(pairs):
        'It flattens an iterable of pairs into a stream of seqs'
        return (seq for pair in pairs for seq in pair)

    def _stop_workers():
        'It terminates the workers, if there are any'
        if workers is not None:
            workers.terminate()

    out_format = remove_multiline(file_format)

    if filtered_fhand is None:
        passed_packets = (packet[SEQS_PASSED] for packet in filter_packets)
        passed_seqs = _unpair(chain.from_iterable(passed_packets))
        try:
            return write_seqs(passed_seqs, passed_fhand,
                              file_format=out_format)
        except BaseException:
            _stop_workers()
            raise

    for packet in filter_packets:
        try:
            write_seqs(_unpair(packet[SEQS_PASSED]), fhand=passed_fhand,
                       file_format=out_format)
            write_seqs(_unpair(packet[SEQS_FILTERED_OUT]),
                       fhand=filtered_fhand, file_format=out_format)
        except BaseException:
            _stop_workers()
            raise
Exemple #5
0
def _read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator over the seqrecords of all the given files.

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise file_format is used for all of them. One reader per file
    is created up front and the readers are chained in the given order.
    '''
    # Parsers that accept the title2ids hook, indexed by format
    parsers = {'fasta': FastaIterator,
               'qual': QualPhredIterator,
               'fastq': FastqPhredIterator,
               'fastq-sanger': FastqPhredIterator,
               'fastq-solexa': FastqSolexaIterator,
               'fastq-illumina': FastqIlluminaIterator}

    def _reader_for(fhand):
        'It creates the seqrecord reader for one file'
        if file_format == GUESS_FORMAT or file_format is None:
            fmt = guess_format(fhand)
        else:
            fmt = file_format
        fmt = remove_multiline(fmt)

        parser = parsers.get(fmt)
        if parser is not None:
            return parser(fhand, title2ids=title2ids)
        # Any other format is handled by the generic parser
        return parse_into_seqrecs(fhand, fmt)

    return chain.from_iterable([_reader_for(fhand) for fhand in fhands])
Exemple #6
0
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the title line as the key and not just the id.

    fpath is the path of the file to index.
    file_format is the format of the file, it is guessed from the file
    content when None.

    It returns the index created by the index function.
    '''
    if file_format is None:
        # The file is only opened to guess the format, so the handle is
        # closed as soon as the format is known
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    file_format = remove_multiline(file_format)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    for fastq_format in ('fastq', 'fastq-sanger', 'fastq-solexa',
                         'fastq-illumina'):
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # The original accessors are restored even when the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
Exemple #7
0
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the title line as the key and not just the id.

    fpath is the path of the file to index.
    file_format is the format of the file, it is guessed from the file
    content when None.

    It returns the index created by the index function.
    '''
    if file_format is None:
        # The file is only opened to guess the format, so the handle is
        # closed as soon as the format is known
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    file_format = remove_multiline(file_format)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    for fastq_format in ('fastq', 'fastq-sanger', 'fastq-solexa',
                         'fastq-illumina'):
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # The original accessors are restored even when the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
Exemple #8
0
def write_filter_packets(passed_fhand,
                         filtered_fhand,
                         filter_packets,
                         file_format='fastq',
                         workers=None):
    '''It writes the passed and the filtered out seqs of a filter stream.

    When there is no filtered_fhand only the passed seqs are written and the
    result of write_seqs is returned. Otherwise the passed and the filtered
    out seqs of every packet are written to their file handles.

    If anything fails while writing, the workers (when given) are terminated
    before the error is raised again.
    '''
    def _flatten(pairs):
        'It turns an iterable of pairs into a stream of seqs'
        return (seq for pair in pairs for seq in pair)

    def _abort_workers():
        'It terminates the workers, if there are any'
        if workers is not None:
            workers.terminate()

    fmt = remove_multiline(file_format)

    if filtered_fhand is None:
        packets_passed = (packet[SEQS_PASSED] for packet in filter_packets)
        seqs_passed = _flatten(chain.from_iterable(packets_passed))
        try:
            return write_seqs(seqs_passed, passed_fhand, file_format=fmt)
        except BaseException:
            _abort_workers()
            raise

    for packet in filter_packets:
        try:
            write_seqs(_flatten(packet[SEQS_PASSED]),
                       fhand=passed_fhand,
                       file_format=fmt)
            write_seqs(_flatten(packet[SEQS_FILTERED_OUT]),
                       fhand=filtered_fhand,
                       file_format=fmt)
        except BaseException:
            _abort_workers()
            raise
Exemple #9
0
def slice_seq(seq, start=None, stop=None):
    '''It returns a new wrapped sequence with the [start:stop] slice of seq.

    SeqItems are sliced by _slice_seqitem and SeqRecords with the standard
    slice notation. The file format of the result is the one of seq without
    the multiline flag.

    It raises a ValueError if the kind of seq is unknown.
    '''
    seq_class = seq.kind
    if seq_class == SEQITEM:
        seq_obj = _slice_seqitem(seq, start, stop)
    elif seq_class == SEQRECORD:
        seq_obj = seq.object[start:stop]
    else:
        # Without this branch the code below failed with an
        # UnboundLocalError that hid the real problem
        raise ValueError('Unknown class for seq: ' + str(seq_class))
    return SeqWrapper(seq.kind, object=seq_obj,
                      file_format=remove_multiline(seq.file_format))
Exemple #10
0
def write_seq_packets(fhand, seq_packets, file_format='fastq', workers=None):
    '''It writes a stream of seq packets (lists of seqs) into a file.

    If anything fails while writing, the workers (when given) are terminated
    before the error is raised again.
    '''
    out_format = remove_multiline(file_format)
    try:
        flat_seqs = chain.from_iterable(seq_packets)
        write_seqs(flat_seqs, fhand, file_format=out_format)
    except BaseException:
        if workers is None:
            raise
        workers.terminate()
        raise
Exemple #11
0
def write_seq_packets(fhand, seq_packets, file_format='fastq', workers=None):
    '''It writes a stream of seq packets (lists of seqs) into a file.

    If anything fails while writing, the workers (when given) are terminated
    before the error is raised again.
    '''
    fmt = remove_multiline(file_format)
    have_workers = workers is not None
    try:
        seqs = chain.from_iterable(seq_packets)
        write_seqs(seqs, fhand, file_format=fmt)
    except BaseException:
        if have_workers:
            workers.terminate()
        raise
Exemple #12
0
def _write_seqrecords(seqs, fhand=None, file_format='fastq'):
    '''It writes a stream of seqrecords to a file.

    seqs is the stream of seqrecords, it is cleaned by
    _clean_seqrecord_stream before being written.
    fhand is the output file; a named temporary file is created when None.
    file_format is the output format, the multiline flag is ignored.

    It returns the file handle used, so a temporary file created here is not
    lost. An IOError due to a broken pipe is silenced.
    '''
    file_format = remove_multiline(file_format)

    if fhand is None:
        fhand = NamedTemporaryFile(suffix='.' + file_format.replace('-', '_'))
    seqs = _clean_seqrecord_stream(seqs)
    try:
        write_seqrecs(seqs, fhand, file_format)
    except IOError as error:
        # The pipe could be already closed
        if 'Broken pipe' not in str(error):
            raise
    return fhand
Exemple #13
0
def _write_seqrecords(seqs, fhand=None, file_format='fastq'):
    '''It writes a stream of seqrecords to a file.

    seqs is the stream of seqrecords, it is cleaned by
    _clean_seqrecord_stream before being written.
    fhand is the output file; a named temporary file is created when None.
    file_format is the output format, the multiline flag is ignored.

    It returns the file handle used, so a temporary file created here is not
    lost. An IOError due to a broken pipe is silenced.
    '''
    file_format = remove_multiline(file_format)

    if fhand is None:
        fhand = NamedTemporaryFile(suffix='.' + file_format.replace('-', '_'))
    seqs = _clean_seqrecord_stream(seqs)
    try:
        write_seqrecs(seqs, fhand, file_format)
    except IOError as error:
        # The pipe could be already closed
        if 'Broken pipe' not in str(error):
            raise
    return fhand
Exemple #14
0
def _write_seqitems(items, fhand, file_format):
    '''It writes fastq SeqItems as fasta and checks the format of the rest.

    items is a stream of wrapped SeqItems.
    fhand is the output file.
    file_format is the requested output format.

    A fastq item requested as fasta is written using its title line (with
    '>' instead of its first character) and its second line. An IOError due
    to a broken pipe is silenced.

    It raises a RuntimeError if the format of an item does not match the
    requested one and it is not the fastq to fasta case.
    '''
    # NOTE(review): in this function nothing is written for the items that
    # are already in the requested format, or when file_format is empty.
    # Confirm if a plain write of the item lines is missing here.
    for seq in items:
        seqitems_fmt = remove_multiline(seq.file_format)
        if file_format and 'fastq' in seqitems_fmt and 'fasta' in file_format:
            seq_lines = seq.object.lines
            try:
                fhand.write('>' + seq_lines[0][1:] + seq_lines[1])
            except IOError as error:
                # The pipe could be already closed
                if 'Broken pipe' not in str(error):
                    raise
        elif file_format and seqitems_fmt != file_format:
            msg = 'Input and output file formats do not match, you should not '
            msg += 'use SeqItems: ' + str(seq.file_format) + ' '
            msg += str(file_format)
            raise RuntimeError(msg)
Exemple #15
0
def _write_seqitems(items, fhand, file_format):
    '''It writes fastq SeqItems as fasta and checks the format of the rest.

    items is a stream of wrapped SeqItems.
    fhand is the output file.
    file_format is the requested output format.

    A fastq item requested as fasta is written using its title line (with
    '>' instead of its first character) and its second line. An IOError due
    to a broken pipe is silenced.

    It raises a RuntimeError if the format of an item does not match the
    requested one and it is not the fastq to fasta case.
    '''
    # NOTE(review): in this function nothing is written for the items that
    # are already in the requested format, or when file_format is empty.
    # Confirm if a plain write of the item lines is missing here.
    for seq in items:
        seqitems_fmt = remove_multiline(seq.file_format)
        if file_format and 'fastq' in seqitems_fmt and 'fasta' in file_format:
            seq_lines = seq.object.lines
            try:
                fhand.write('>' + seq_lines[0][1:] + seq_lines[1])
            except IOError as error:
                # The pipe could be already closed
                if 'Broken pipe' not in str(error):
                    raise
        elif file_format and seqitems_fmt != file_format:
            msg = 'Input and output file formats do not match, you should not '
            msg += 'use SeqItems: ' + str(seq.file_format) + ' '
            msg += str(file_format)
            raise RuntimeError(msg)
Exemple #16
0
def _copy_seqitem(seqwrapper, seq=None, name=None):
    '''It returns a new wrapped SeqItem, optionally with a new seq or name.

    seqwrapper is the wrapped SeqItem to copy.
    seq, when given, is the sequence string (without the line ending) that
    replaces the original one.
    name, when given, replaces the text of the title line and the name of the
    new item.

    It raises a ValueError if the new sequence and the quality line of a
    fastq item differ in length and a RuntimeError if a new seq is given for
    an item that is neither fasta nor fastq.
    '''
    seq_item = seqwrapper.object
    lines = seq_item.lines
    fmt = seqwrapper.file_format
    if seq is None:
        # A copy of the list, so the title edit done below does not modify
        # the list held by the original item
        lines = lines[:]
    else:
        if 'fasta' in fmt:
            lines = [lines[0], seq + '\n']
        elif 'multiline' in fmt and 'fastq' in fmt:
            # The quality lines are joined in one, so the new item is a four
            # line fastq and the multiline flag is removed from its format
            qline = ''.join([qline.strip() for qline in _get_seqitem_qual_lines(seqwrapper)])
            lines = [lines[0], seq + '\n', '+\n', qline + '\n']
            fmt = remove_multiline(fmt)
            # Both lines end with a line ending, so equal lengths mean the
            # same number of residues and qualities
            if len(lines[1]) != len(lines[3]):
                msg = 'Sequence and quality line length do not match'
                raise ValueError(msg)
        elif 'fastq' in fmt:
            lines = [lines[0], seq + '\n', lines[2], lines[3]]
            if len(lines[1]) != len(lines[3]):
                msg = 'Sequence and quality line length do not match'
                raise ValueError(msg)
        else:
            raise RuntimeError('Unknown format for a SequenceItem')

    if name:
        # The first character of the title line is kept and the rest is
        # replaced by the new name
        lines[0] = lines[0][0] + name + '\n'
        if 'fastq' in fmt:
            if 'multiline' in fmt:
                # NOTE(review): _sitem_fastq_plus_line_index is used as the
                # index without being called. If it is a function that looks
                # for the plus line this would fail, confirm against its
                # definition.
                line_plus_index = _sitem_fastq_plus_line_index
            else:
                line_plus_index = 2
            # The plus line is reset, so it does not keep any old title
            lines[line_plus_index] = '+\n'
    name = seq_item.name if name is None else name

    annotations = seq_item.annotations
    if annotations is not None:
        # The new item gets its own copy of the annotations
        annotations = annotations.copy()
    seq = SeqWrapper(kind=seqwrapper.kind,
                     object=SeqItem(name, lines, annotations),
                     file_format=fmt)
    return seq
Exemple #17
0
def write_seqs(seqs, fhand=None, file_format=None):
    '''It writes the given sequences.

    seqs is a stream of wrapped sequences, all of them are assumed to be of
    the kind of the first one.
    fhand is the output file; a named temporary file is created when None.
    file_format is the output format, the multiline flag is ignored.

    It returns the file handle used.
    It raises a ValueError if the kind of the sequences is unknown.
    '''
    if fhand is None:
        # file_format is None by default, so the suffix can only be built
        # when a format has been given
        if file_format is None:
            suffix = ''
        else:
            suffix = '.' + file_format.replace('-', '_')
        fhand = NamedTemporaryFile(suffix=suffix)

    file_format = remove_multiline(file_format)
    # The stream is duplicated to look at the first seq without consuming it
    seqs, seqs2 = tee(seqs)
    try:
        seq = next(seqs2)
    except StopIteration:
        # No sequences to write, so we're done
        return fhand
    del seqs2
    seq_class = seq.kind
    if seq_class == SEQITEM:
        _write_seqitems(seqs, fhand, file_format)
    elif seq_class == SEQRECORD:
        seqs = (seq.object for seq in seqs)
        _write_seqrecords(seqs, fhand, file_format)
    else:
        raise ValueError('Unknown class for seq: ' + str(seq_class))
    return fhand
Exemple #18
0
def write_seqs(seqs, fhand=None, file_format=None):
    '''It writes the given sequences.

    seqs is a stream of wrapped sequences, all of them are assumed to be of
    the kind of the first one.
    fhand is the output file; a named temporary file is created when None.
    file_format is the output format, the multiline flag is ignored.

    It returns the file handle used.
    It raises a ValueError if the kind of the sequences is unknown.
    '''
    if fhand is None:
        # file_format is None by default, so the suffix can only be built
        # when a format has been given
        if file_format is None:
            suffix = ''
        else:
            suffix = '.' + file_format.replace('-', '_')
        fhand = NamedTemporaryFile(suffix=suffix)

    file_format = remove_multiline(file_format)
    # The stream is duplicated to look at the first seq without consuming it
    seqs, seqs2 = tee(seqs)
    try:
        seq = next(seqs2)
    except StopIteration:
        # No sequences to write, so we're done
        return fhand
    del seqs2
    seq_class = seq.kind
    if seq_class == SEQITEM:
        _write_seqitems(seqs, fhand, file_format)
    elif seq_class == SEQRECORD:
        seqs = (seq.object for seq in seqs)
        _write_seqrecords(seqs, fhand, file_format)
    else:
        raise ValueError('Unknown class for seq: ' + str(seq_class))
    return fhand
Exemple #19
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    in_fhands is a list with the input files, their formats are guessed.
    out_fhand is the output file.
    out_format is the output format, it should be one of the formats in the
    SUPPORTED_OUTPUT_FORMATS setting.
    copy_if_same_format is used when there is only one input file and it is
    already in the output format: the content is copied if True, otherwise
    a symlink to the input file is created.

    It raises an IncompatibleFormatError if the output format is not
    supported or if it requires qualities that are not available and a
    MalformedFile if the sequences and the qualities of the input disagree.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [remove_multiline(guess_format(fhand)) for fhand in in_fhands]

    if len(in_fhands) == 1 and in_formats[0] == out_format:
        # No conversion is required
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        except ValueError as error:
            # The errors of the writer are translated into our own errors
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise
Exemple #20
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    in_fhands is a list with the input files, their formats are guessed.
    out_fhand is the output file.
    out_format is the output format, it should be one of the formats in the
    SUPPORTED_OUTPUT_FORMATS setting.
    copy_if_same_format is used when there is only one input file and it is
    already in the output format: the content is copied if True, otherwise
    a symlink to the input file is created.

    It raises an IncompatibleFormatError if the output format is not
    supported or if it requires qualities that are not available and a
    MalformedFile if the sequences and the qualities of the input disagree.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [remove_multiline(guess_format(fhand)) for fhand in in_fhands]

    if len(in_fhands) == 1 and in_formats[0] == out_format:
        # No conversion is required
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        except ValueError as error:
            # The errors of the writer are translated into our own errors
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise