Esempio n. 1
0
def match_pairs(reads,
                out_fhand,
                orphan_out_fhand,
                out_format,
                ordered=True,
                check_order_buffer_size=0,
                max_reads_memory=None,
                temp_dir=None):
    '''It writes the paired seqs to one file and the orphan seqs to another.

    The groups yielded by _get_paired_and_orphan(reads, ordered,
    max_reads_memory, temp_dir) are processed one by one:

      - groups with two seqs are written to out_fhand.
      - groups with one seq are written to orphan_out_fhand.
      - groups of any other size are ignored.

    When ordered is true the names of the orphans are used to check the
    ordering. The names of the first check_order_buffer_size orphans are
    stored in a KeyedSet with check_add; once that number has been reached
    the following orphan names are only looked up in the set. An
    ItemsNotSortedError is raised if check_add returns a false value or if a
    later orphan name is already found in the set.

    Both output file handles are flushed with flush_fhand at the end.
    '''
    not_sorted_msg = 'Reads are not ordered by pairs.Use unordered option'
    n_buffered = 0
    seen_orphans = KeyedSet()
    groups = _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                    temp_dir)
    for group in groups:
        n_seqs = len(group)
        if n_seqs == 2:
            write_seqs(group, out_fhand, out_format)
            continue
        if n_seqs != 1:
            continue

        # from here on the group holds a single orphan seq
        write_seqs(group, orphan_out_fhand, out_format)
        orphan = group[0]
        try:
            name = _parse_pair_direction_and_name(orphan)[0]
        except PairDirectionError:
            # no pair direction in the name, the plain name is used instead
            name = get_name(orphan)

        if not ordered:
            continue

        if n_buffered < check_order_buffer_size:
            n_buffered += 1
            if not seen_orphans.check_add(name):
                raise ItemsNotSortedError(not_sorted_msg)
        elif n_buffered >= check_order_buffer_size:
            if name in seen_orphans:
                raise ItemsNotSortedError(not_sorted_msg)

    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Esempio n. 2
0
def match_pairs(reads, out_fhand, orphan_out_fhand, out_format, ordered=True,
                check_order_buffer_size=0, max_reads_memory=None,
                temp_dir=None):
    '''It splits the seqs into a paired output and an orphan output.

    Every item yielded by _get_paired_and_orphan is either written to
    out_fhand (two seqs) or to orphan_out_fhand (one seq). Items of any other
    length are not written.

    If ordered is true the orphan names are also used to detect unsorted
    input: up to check_order_buffer_size names are added to a KeyedSet with
    check_add and, after that, the names are just searched in the set.
    ItemsNotSortedError is raised when check_add returns a false value or
    when a later name is already in the set.

    The orphan name is the first item returned by
    _parse_pair_direction_and_name or, if that raises PairDirectionError,
    the result of get_name.
    '''
    unsorted_msg = 'Reads are not ordered by pairs.Use unordered option'
    n_names_seen = 0
    names_seen = KeyedSet()
    for seqs in _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                       temp_dir):
        if len(seqs) == 2:
            write_seqs(seqs, out_fhand, out_format)
        elif len(seqs) == 1:
            write_seqs(seqs, orphan_out_fhand, out_format)
            try:
                name = _parse_pair_direction_and_name(seqs[0])[0]
            except PairDirectionError:
                name = get_name(seqs[0])

            if ordered:
                in_order = True
                if n_names_seen < check_order_buffer_size:
                    # still filling the buffer of names
                    n_names_seen += 1
                    in_order = names_seen.check_add(name)
                elif n_names_seen >= check_order_buffer_size:
                    # buffer full, the name is only looked up
                    in_order = name not in names_seen
                if not in_order:
                    raise ItemsNotSortedError(unsorted_msg)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Esempio n. 3
0
def _test_filter_duplicates(paired_reads, n_seqs_packet):
    '''It checks that filter_duplicates removes the duplicated reads.

    The FASTQ fixtures that include the duplicates are written to a temporary
    file and filtered with filter_duplicates. The result is compared with the
    pairs read from the fixtures without duplicates: every expected pair has
    to be found exactly once in the filtered output.

    paired_reads is passed on to filter_duplicates and _read_pairs.
    n_seqs_packet has to be an int or None and is passed on to
    filter_duplicates.
    '''
    assert n_seqs_packet is None or isinstance(n_seqs_packet, int)
    fastq_with_dups = (FASTQ_NO_DUPS1 + FASTQ_DUPS + FASTQ_NO_DUPS2
                       + FASTQ_NO_DUPS3)
    fastq_no_dups = FASTQ_NO_DUPS1 + FASTQ_NO_DUPS2 + FASTQ_NO_DUPS3

    # The with blocks keep the temporary files referenced while they are
    # being read (a NamedTemporaryFile is deleted as soon as it is closed or
    # garbage collected) and close every handle on exit.
    with NamedTemporaryFile() as tmp_fhand, NamedTemporaryFile() as out_fhand:
        tmp_fhand.write(fastq_with_dups)
        tmp_fhand.flush()
        with open(tmp_fhand.name) as in_fhand:
            filter_duplicates([in_fhand], out_fhand, paired_reads,
                              n_seqs_packet)
        flush_fhand(out_fhand)
        with open(out_fhand.name) as filtered_fhand:
            filtered_pairs = list(_read_pairs([filtered_fhand], paired_reads))

    expected_pairs = list(_read_pairs([StringIO(fastq_no_dups)],
                                      paired_reads))
    # NOTE: the number of filtered pairs is not compared with the number of
    # expected pairs, only that each expected pair is found exactly once.
    for pair1 in expected_pairs:
        counts = sum(1 for pair2 in filtered_pairs
                     if _seqitem_pairs_equal(pair1, pair2))
        assert counts == 1
Esempio n. 4
0
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The seqs are read one by one. The ones whose pair direction can not be
    parsed (PairDirectionError) are written straight away to
    orphan_out_fhand. The others are kept in one buffer per direction until
    a seq with the same name and the opposite direction arrives; then the
    pair is written to out_fhand and the seqs buffered before the match are
    written to orphan_out_fhand. The seqs still buffered when the input ends
    are written as orphans too.

    If memory_limit is not None, MaxNumReadsInMem is raised as soon as the
    number of buffered seqs reaches it.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # mate_buf -> buffer with the reads of the opposite direction
    # own_buf -> buffer with the reads of the same direction
    mate_buf, own_buf = buf_fwd, buf_rev   # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        if direction == FWD:
            mate_buf, own_buf = buf_rev, buf_fwd
        else:
            mate_buf, own_buf = buf_fwd, buf_rev

        matching_seq_index = mate_buf['index'].get(seq_name)

        if matching_seq_index is None:
            # no mate yet, the seq is kept in the buffer of its direction
            own_buf['items'].append(seq)
            own_buf['index'][seq_name] = len(own_buf['items']) - 1
            # check mem limit
            sum_items = len(mate_buf['items']) + len(own_buf['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
            continue

        # the seqs buffered before the mate are orphans
        mate_items = mate_buf['items']
        orphan_seqs = mate_items[:matching_seq_index]
        matching_seq = mate_items[matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # The seqs buffered after the mate are kept. The index has to remain
        # keyed by seq name, because that is how it is queried, so the
        # surviving entries are shifted to their new positions.
        n_removed = matching_seq_index + 1
        mate_buf['items'] = mate_items[n_removed:]
        mate_buf['index'] = {name: index - n_removed
                             for name, index in mate_buf['index'].items()
                             if index >= n_removed}

        # everything buffered in the direction of the current seq is orphan
        write_seqs(own_buf['items'], orphan_out_fhand, out_format)
        own_buf['items'] = []
        own_buf['index'] = {}

    # whatever is still buffered has no mate
    orphan_seqs = mate_buf['items'] + own_buf['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
Esempio n. 5
0
def _test_filter_duplicates(paired_reads, n_seqs_packet):
    '''It checks filter_duplicates with and without the use_length option.

    First the FASTQ fixtures that include the duplicates are filtered and
    every pair read from the fixtures without duplicates has to be found
    exactly once in the result. paired_reads is passed on to
    filter_duplicates and _read_pairs for this check.

    Then FASTQ_DUPS alone is filtered as unpaired reads with use_length=10
    and use_length=1, and the number of reads left is checked (2 and 1).

    n_seqs_packet has to be an int or None and is passed on to
    filter_duplicates in all the cases.
    '''
    assert n_seqs_packet is None or isinstance(n_seqs_packet, int)
    fastq_with_dups = (FASTQ_NO_DUPS1 + FASTQ_DUPS + FASTQ_NO_DUPS2 +
                       FASTQ_NO_DUPS3)
    fastq_no_dups = FASTQ_NO_DUPS1 + FASTQ_NO_DUPS2 + FASTQ_NO_DUPS3

    # The with blocks keep the temporary files referenced while they are
    # being read (a NamedTemporaryFile is deleted as soon as it is closed or
    # garbage collected) and close every handle on exit.
    with NamedTemporaryFile() as tmp_fhand, NamedTemporaryFile() as out_fhand:
        tmp_fhand.write(fastq_with_dups)
        tmp_fhand.flush()
        with open(tmp_fhand.name) as in_fhand:
            filter_duplicates([in_fhand], out_fhand, paired_reads,
                              n_seqs_packet)
        flush_fhand(out_fhand)
        with open(out_fhand.name) as filtered_fhand:
            filtered_pairs = list(_read_pairs([filtered_fhand], paired_reads))

    expected_pairs = list(_read_pairs([StringIO(fastq_no_dups)], paired_reads))
    # NOTE: the number of filtered pairs is not compared with the number of
    # expected pairs, only that each expected pair is found exactly once.
    for pair1 in expected_pairs:
        counts = sum(1 for pair2 in filtered_pairs
                     if _seqitem_pairs_equal(pair1, pair2))
        assert counts == 1

    # use length: (value given to use_length, number of reads expected)
    for use_length, n_expected in ((10, 2), (1, 1)):
        with NamedTemporaryFile() as tmp_fhand, \
                NamedTemporaryFile() as out_fhand:
            tmp_fhand.write(FASTQ_DUPS)
            tmp_fhand.flush()
            with open(tmp_fhand.name) as in_fhand:
                filter_duplicates([in_fhand],
                                  out_fhand,
                                  paired_reads=False,
                                  n_seqs_packet=n_seqs_packet,
                                  use_length=use_length)
            flush_fhand(out_fhand)
            with open(out_fhand.name) as filtered_fhand:
                filtered_pairs = list(
                    _read_pairs([filtered_fhand], paired_reads=False))
        assert len(filtered_pairs) == n_expected
Esempio n. 6
0
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        except ValueError, error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise
    flush_fhand(out_fhand)


def fastaqual_to_fasta(seq_fhand, qual_fhand, out_fhand):
    '''It converts a fasta and a qual file into a fastq format file.

    The seq records are read from seq_fhand and qual_fhand with
    PairedFastaQualIterator and written in fastq format with write_seqrecs.

    A ValueError for which error_quality_disagree returns a true value is
    raised again as MalformedFile; any other ValueError is propagated
    unchanged.
    '''
    seqrecords = PairedFastaQualIterator(seq_fhand, qual_fhand)
    try:
        # NOTE(review): the output is given to write_seqrecs as a path
        # (out_fhand.name), not as the file handle itself -- confirm that
        # write_seqrecs accepts a file name.
        write_seqrecs(seqrecords, out_fhand.name, 'fastq')
    except ValueError as error:
        # "except ... as ..." is valid in Python 2.6+ and in Python 3
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        raise
    out_fhand.flush()


def guess_seq_type(fhand):
Esempio n. 7
0
 def flush(self):
     'It flushes the stream held in self.stream using flush_fhand.'
     stream = self.stream
     flush_fhand(stream)
Esempio n. 8
0
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        except ValueError, error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise
    flush_fhand(out_fhand)


def fastaqual_to_fasta(seq_fhand, qual_fhand, out_fhand):
    '''It converts a fasta and a qual file into a fastq format file.

    PairedFastaQualIterator is used to read the seq records from seq_fhand
    and qual_fhand, and write_seqrecs to write them in fastq format.

    If write_seqrecs raises a ValueError and error_quality_disagree returns
    a true value for it, a MalformedFile error is raised instead. Any other
    ValueError is raised again as it is.
    '''
    seqrecords = PairedFastaQualIterator(seq_fhand, qual_fhand)
    try:
        # NOTE(review): write_seqrecs gets the path of the output file
        # (out_fhand.name) and not the handle -- confirm that this is
        # supported by write_seqrecs.
        write_seqrecs(seqrecords, out_fhand.name, 'fastq')
    except ValueError as error:
        # the "as" form works in Python 2.6+ and in Python 3
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        raise
    out_fhand.flush()


def guess_seq_type(fhand):
Esempio n. 9
0
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The seqs whose pair direction can not be parsed (PairDirectionError) are
    written straight away to orphan_out_fhand. The others are buffered, one
    buffer per direction, until a seq with the same name arrives with the
    opposite direction. Then the pair is written to out_fhand, the seqs
    buffered before the mate are written to orphan_out_fhand and both
    buffers are emptied. The seqs still buffered at the end of the input
    are written as orphans.

    It raises MaxNumReadsInMem if memory_limit is not None and the number of
    buffered seqs reaches it, and MalformedFile if the mate found is not the
    last seq buffered for its direction.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_rev, buf_fwd  # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer for the reads with the same orientation as the
        # current one
        # buf2 -> buffer for the reads with the reverse orientation as the
        # current one
        if direction == FWD:
            buf1, buf2 = buf_fwd, buf_rev
        else:
            buf1, buf2 = buf_rev, buf_fwd

        matching_seq_index = buf2['index'].get(seq_name)

        if matching_seq_index is None:
            # add to buff
            buf1['items'].append(seq)
            buf1['index'][seq_name] = len(buf1['items']) - 1
            # check mem limit
            sum_items = len(buf2['items']) + len(buf1['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += 'Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorith'
                raise MaxNumReadsInMem(error_msg)
        else:
            # the seqs buffered before the mate are orphans
            orphan_seqs = buf2['items'][:matching_seq_index]
            matching_seq = buf2['items'][matching_seq_index]
            write_seqs(orphan_seqs, orphan_out_fhand, out_format)
            write_seqs([matching_seq, seq], out_fhand, out_format)
            # the mate has to be the last seq buffered for its direction
            if matching_seq_index != len(buf2['items']) - 1:
                msg = 'The given files are not sorted (ordered) and '
                msg += 'interleaved. You could try with the unordered algorithm'
                raise MalformedFile(msg)
            buf2 = {'index': {}, 'items': []}
            # the seqs buffered in the direction of the current seq are
            # orphans too
            write_seqs(buf1['items'], orphan_out_fhand, out_format)
            buf1 = {'index': {}, 'items': []}

        # buf1 and buf2 may have been replaced by new buffers, so they are
        # stored back in the buffers of their directions
        if direction == FWD:
            buf_fwd, buf_rev = buf1, buf2
        else:
            buf_rev, buf_fwd = buf1, buf2

    # whatever is still buffered has no mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
Esempio n. 10
0
def match_pairs(seqs,
                out_fhand,
                orphan_out_fhand,
                out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The seqs whose pair direction can not be parsed (PairDirectionError) are
    written straight away to orphan_out_fhand. The others are buffered, one
    buffer per direction, until a seq with the same name arrives with the
    opposite direction. Then the pair is written to out_fhand, the seqs
    buffered before the mate are written to orphan_out_fhand and both
    buffers are emptied. The seqs still buffered at the end of the input
    are written as orphans.

    It raises MaxNumReadsInMem if memory_limit is not None and the number of
    buffered seqs reaches it, and MalformedFile if the mate found is not the
    last seq buffered for its direction.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_rev, buf_fwd  # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer for the reads with the same orientation as the
        # current one
        # buf2 -> buffer for the reads with the reverse orientation as the
        # current one
        if direction == FWD:
            buf1, buf2 = buf_fwd, buf_rev
        else:
            buf1, buf2 = buf_rev, buf_fwd

        matching_seq_index = buf2['index'].get(seq_name)

        if matching_seq_index is None:
            # add to buff
            buf1['items'].append(seq)
            buf1['index'][seq_name] = len(buf1['items']) - 1
            # check mem limit
            sum_items = len(buf2['items']) + len(buf1['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += 'Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorith'
                raise MaxNumReadsInMem(error_msg)
            continue

        # the seqs buffered before the mate are orphans
        orphan_seqs = buf2['items'][:matching_seq_index]
        matching_seq = buf2['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)
        # the mate has to be the last seq buffered for its direction
        if matching_seq_index != len(buf2['items']) - 1:
            msg = 'The given files are not sorted (ordered) and '
            msg += 'interleaved. You could try with the unordered algorithm'
            raise MalformedFile(msg)
        # The buffers are emptied in place. Binding buf2 to a new dict would
        # leave the already paired seq in buf_fwd or buf_rev, and it would be
        # written again later on as an orphan.
        buf2['items'] = []
        buf2['index'] = {}

        # the seqs buffered in the direction of the current seq are orphans
        write_seqs(buf1['items'], orphan_out_fhand, out_format)
        buf1['items'] = []
        buf1['index'] = {}

    # whatever is still buffered has no mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)