Esempio n. 1
0
def _format_fasta_like_records(generator, id_whitespace_replacement,
                               description_newline_replacement, require_qual,
                               lowercase=None):
    """Yield ``(header, sequence string, quality)`` for each sequence.

    This is a generator function, so none of the checks below run until the
    result is first iterated.

    Parameters
    ----------
    generator : iterable
        Sequence objects. Each must support ``len()`` and expose
        ``metadata`` and ``positional_metadata`` attributes.
    id_whitespace_replacement : str or None
        Substituted for every ``_whitespace_regex`` match in the ID. ``None``
        leaves the ID untouched. Must not contain a newline.
    description_newline_replacement : str or None
        Substituted for every ``_newline_regex`` match in the description.
        ``None`` leaves the description untouched. Must not contain a
        newline.
    require_qual : bool
        If true, a sequence without a ``'quality'`` entry in its
        ``positional_metadata`` is an error.
    lowercase : optional
        When not ``None``, it is passed to ``seq.lowercase`` and the result
        is used as the sequence string instead of ``str(seq)``.

    Yields
    ------
    tuple
        ``(header, seq_str, qual)``. ``header`` is the ID, followed by a
        space and the description when the description is non-empty.
        ``qual`` is ``positional_metadata['quality'].values`` or ``None``.

    Raises
    ------
    ValueError
        If a replacement contains a newline, a sequence is empty, or quality
        is required but missing.
    AttributeError
        If `lowercase` is given but the sequence has no ``lowercase``
        attribute.
    """
    replacements = (id_whitespace_replacement,
                    description_newline_replacement)
    if any(repl is not None and '\n' in repl for repl in replacements):
        raise ValueError(
            "Newline character (\\n) cannot be used to replace whitespace in "
            "sequence IDs, nor to replace newlines in sequence descriptions.")

    # Count from 1 so the position can be used directly in error messages.
    for position, seq in enumerate(generator, 1):
        if len(seq) < 1:
            raise ValueError(
                "%s sequence does not contain any characters (i.e., it is an "
                "empty/blank sequence). Writing empty sequences is not "
                "supported." % cardinal_to_ordinal(position))

        # Missing ID/description metadata falls back to an empty string.
        id_ = seq.metadata['id'] if 'id' in seq.metadata else ''
        if id_whitespace_replacement is not None:
            id_ = _whitespace_regex.sub(id_whitespace_replacement, id_)

        desc = (seq.metadata['description']
                if 'description' in seq.metadata else '')
        if description_newline_replacement is not None:
            desc = _newline_regex.sub(description_newline_replacement, desc)

        header = '%s %s' % (id_, desc) if desc else id_

        if require_qual and 'quality' not in seq.positional_metadata:
            raise ValueError(
                "Cannot write %s sequence because it does not have quality "
                "scores associated with it." % cardinal_to_ordinal(position))

        qual = (seq.positional_metadata['quality'].values
                if 'quality' in seq.positional_metadata else None)

        if lowercase is None:
            seq_str = str(seq)
        elif hasattr(seq, 'lowercase'):
            seq_str = seq.lowercase(lowercase)
        else:
            raise AttributeError("lowercase specified but class %s does "
                                 "not support lowercase functionality" %
                                 seq.__class__.__name__)

        yield header, seq_str, qual
Esempio n. 2
0
 def test_valid_range(self):
     # Checks cardinal_to_ordinal on 0 through 32 plus a few larger values.
     # Expected strings taken and modified from
     # http://stackoverflow.com/a/20007730/3776794
     cardinals = list(range(0, 33)) + [100, 101, 42042]
     expected = [
         '0th', '1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th',
         '9th', '10th', '11th', '12th', '13th', '14th', '15th', '16th',
         '17th', '18th', '19th', '20th', '21st', '22nd', '23rd', '24th',
         '25th', '26th', '27th', '28th', '29th', '30th', '31st', '32nd',
         '100th', '101st', '42042nd',
     ]
     observed = [cardinal_to_ordinal(value) for value in cardinals]
     self.assertEqual(observed, expected)
Esempio n. 3
0
 def test_valid_range(self):
     # Converts 0..32 and a handful of larger integers, then compares the
     # results against hand-written ordinals (taken and modified from
     # http://stackoverflow.com/a/20007730/3776794).
     numbers = list(range(0, 33))
     numbers += [100, 101, 42042]

     obs = list(map(cardinal_to_ordinal, numbers))

     exp = ['0th', '1st', '2nd', '3rd', '4th', '5th', '6th', '7th',
            '8th', '9th', '10th', '11th', '12th', '13th', '14th', '15th',
            '16th', '17th', '18th', '19th', '20th', '21st', '22nd', '23rd',
            '24th', '25th', '26th', '27th', '28th', '29th', '30th', '31st',
            '32nd', '100th', '101st', '42042nd']
     self.assertEqual(obs, exp)
Esempio n. 4
0
def _format_fasta_like_records(
    generator, id_whitespace_replacement, description_newline_replacement, require_qual
):
    """Yield ``(header, str(seq), seq.quality)`` for each sequence.

    Because this is a generator function, validation is deferred until the
    result is first iterated.

    Parameters
    ----------
    generator : iterable
        Sequence objects exposing ``id``, ``description`` and ``quality`` and
        supporting ``len()`` and ``str()``.
    id_whitespace_replacement : str or None
        Substituted for every ``_whitespace_regex`` match in the ID; ``None``
        disables the substitution. Must not contain a newline.
    description_newline_replacement : str or None
        Substituted for every ``_newline_regex`` match in the description;
        ``None`` disables the substitution. Must not contain a newline.
    require_qual : bool
        If true, a sequence whose ``quality`` is ``None`` is an error.

    Raises
    ------
    ValueError
        If a replacement contains a newline, a sequence is empty, or quality
        is required but missing.
    """
    for replacement in (id_whitespace_replacement, description_newline_replacement):
        if replacement is not None and "\n" in replacement:
            raise ValueError(
                "Newline character (\\n) cannot be used to replace whitespace in "
                "sequence IDs, nor to replace newlines in sequence descriptions."
            )

    # 1-based position, used verbatim in the error messages.
    for position, seq in enumerate(generator, 1):
        if len(seq) < 1:
            raise ValueError(
                "%s sequence does not contain any characters (i.e., it is an "
                "empty/blank sequence). Writing empty sequences is not "
                "supported." % cardinal_to_ordinal(position)
            )

        id_ = seq.id
        if id_whitespace_replacement is not None:
            id_ = _whitespace_regex.sub(id_whitespace_replacement, id_)

        desc = seq.description
        if description_newline_replacement is not None:
            desc = _newline_regex.sub(description_newline_replacement, desc)

        # Only append the description (space-separated) when there is one.
        header = "%s %s" % (id_, desc) if desc else id_

        if require_qual and seq.quality is None:
            raise ValueError(
                "Cannot write %s sequence because it does not have quality "
                "scores associated with it." % cardinal_to_ordinal(position)
            )

        yield header, str(seq), seq.quality
Esempio n. 5
0
def _format_fasta_like_records(generator, id_whitespace_replacement,
                               description_newline_replacement, require_qual):
    """Yield ``(header, seq.sequence, seq.quality)`` for each sequence.

    As a generator function, it performs no validation until the result is
    first iterated.

    Parameters
    ----------
    generator : iterable
        Sequence objects exposing ``id``, ``description``, ``sequence``,
        ``quality`` and ``has_quality()``, and supporting ``len()``.
    id_whitespace_replacement : str or None
        Replacement for ``_whitespace_regex`` matches in the ID, or ``None``
        to keep the ID unchanged. Must not contain a newline.
    description_newline_replacement : str or None
        Replacement for ``_newline_regex`` matches in the description, or
        ``None`` to keep the description unchanged. Must not contain a
        newline.
    require_qual : bool
        If true, a sequence for which ``has_quality()`` is false is an error.

    Raises
    ------
    ValueError
        If a replacement contains a newline, a sequence is empty, or quality
        is required but missing.
    """
    for replacement in (id_whitespace_replacement,
                        description_newline_replacement):
        if replacement is not None and '\n' in replacement:
            raise ValueError(
                "Newline character (\\n) cannot be used to replace whitespace "
                "in sequence IDs, nor to replace newlines in sequence "
                "descriptions.")

    # These flags do not change between records, so compute them once.
    fix_id = id_whitespace_replacement is not None
    fix_desc = description_newline_replacement is not None

    for position, seq in enumerate(generator, 1):
        if len(seq) < 1:
            raise ValueError(
                "%s sequence does not contain any characters (i.e., it is an "
                "empty/blank sequence). Writing empty sequences is not "
                "supported." % cardinal_to_ordinal(position))

        id_ = seq.id
        if fix_id:
            id_ = _whitespace_regex.sub(id_whitespace_replacement, id_)

        desc = seq.description
        if fix_desc:
            desc = _newline_regex.sub(description_newline_replacement, desc)

        # The header is the bare ID unless a non-empty description exists.
        header = id_
        if desc:
            header = '%s %s' % (id_, desc)

        if require_qual and not seq.has_quality():
            raise ValueError(
                "Cannot write %s sequence because it does not have quality "
                "scores associated with it." % cardinal_to_ordinal(position))

        yield header, seq.sequence, seq.quality
Esempio n. 6
0
def _get_nth_sequence(generator, seq_num):
    """Return the `seq_num`-th (1-based) item yielded by `generator`.

    The generator is always closed once iteration has been attempted,
    whether or not the requested item was found.

    Raises
    ------
    ValueError
        If `seq_num` is ``None`` or less than 1, or if `generator` yields
        fewer than `seq_num` items.
    """
    if seq_num is None or seq_num < 1:
        raise ValueError('Invalid sequence number (`seq_num`=%s). `seq_num`'
                         ' must be between 1 and the number of sequences in'
                         ' the file.' % str(seq_num))

    # Stays None when the generator yields nothing, so the comparison below
    # never touches an unbound name.
    reached = None
    try:
        # The bounded range caps iteration at `seq_num` items; the loop body
        # only needs to leave the last pair bound.
        for reached, candidate in zip(range(1, seq_num + 1), generator):
            pass
    finally:
        generator.close()

    if reached != seq_num:
        raise ValueError('Reached end of file before finding the %s sequence.'
                         % cardinal_to_ordinal(seq_num))
    return candidate
Esempio n. 7
0
def _get_nth_sequence(generator, seq_num):
    """Fetch item number `seq_num` (counting from 1) from `generator`.

    Parameters
    ----------
    generator : generator
        Source of items; it is closed after iteration, even on error.
    seq_num : int
        1-based position of the item to return.

    Returns
    -------
    The `seq_num`-th item yielded by `generator`.

    Raises
    ------
    ValueError
        If `seq_num` is ``None`` or below 1, or if the generator is exhausted
        before reaching position `seq_num`.
    """
    invalid = seq_num is None or seq_num < 1
    if invalid:
        raise ValueError('Invalid sequence number (`seq_num`=%s). `seq_num`'
                         ' must be between 1 and the number of sequences in'
                         ' the file.' % str(seq_num))

    # `last_pos` keeps its None default if the generator is empty, which
    # makes the "not found" check below safe.
    last_pos = None
    positions = range(1, seq_num + 1)
    try:
        for last_pos, current in zip(positions, generator):
            pass
    finally:
        generator.close()

    if last_pos == seq_num:
        return current

    raise ValueError('Reached end of file before finding the %s sequence.'
                     % cardinal_to_ordinal(seq_num))
Esempio n. 8
0
 def test_valid_range(self):
     # Compares cardinal_to_ordinal output for 0-32, 100, 101 and 42042 with
     # hand-written ordinals.
     # taken and modified from http://stackoverflow.com/a/20007730/3776794
     inputs = list(range(0, 33)) + [100, 101, 42042]

     obs = []
     for n in inputs:
         obs.append(cardinal_to_ordinal(n))

     exp = [
         "0th", "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th",
         "10th", "11th", "12th", "13th", "14th", "15th", "16th", "17th",
         "18th", "19th", "20th", "21st", "22nd", "23rd", "24th", "25th",
         "26th", "27th", "28th", "29th", "30th", "31st", "32nd",
         "100th", "101st", "42042nd",
     ]
     self.assertEqual(obs, exp)
Esempio n. 9
0
def _generator_to_fasta(obj, fh, id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None):
    """Write every sequence in `obj` to `fh` as a FASTA record.

    Each record is written as ``'>' + header + '\\n' + sequence + '\\n'``.
    The header is the sequence ID, followed by a space and the description
    when the description is non-empty.

    Parameters
    ----------
    obj : iterable
        Sequence objects exposing ``id`` and ``description`` and supporting
        ``len()`` and ``str()``.
    fh : file-like
        Destination; only its ``write`` method is used.
    id_whitespace_replacement : str or None
        Replaces each whitespace character in the ID; ``None`` disables the
        substitution. Must not contain a newline.
    description_newline_replacement : str or None
        Replaces each newline in the description; ``None`` disables the
        substitution. Must not contain a newline.
    max_width : optional
        When not ``None``, the sequence string is passed through
        ``_chunk_str(seq_str, max_width, '\\n')`` before writing.

    Raises
    ------
    FASTAFormatError
        If a replacement contains a newline or a sequence is empty.
    """
    for replacement in (id_whitespace_replacement,
                        description_newline_replacement):
        if replacement is not None and '\n' in replacement:
            raise FASTAFormatError(
                "Newline character (\\n) cannot be used to replace whitespace "
                "in biological sequence IDs, nor to replace newlines in "
                "biological sequence descriptions. Otherwise, the "
                "FASTA-formatted file will be invalid.")

    whitespace = re.compile(r'\s')
    newline = re.compile(r'\n')

    # 1-based position, used verbatim in the error message.
    for position, seq in enumerate(obj, 1):
        if len(seq) < 1:
            raise FASTAFormatError(
                "Cannot write %s biological sequence in FASTA format because "
                "it does not contain any characters (i.e., it is an "
                "empty/blank sequence). Empty sequences are not supported in "
                "the FASTA file format." % cardinal_to_ordinal(position))

        id_ = seq.id
        if id_whitespace_replacement is not None:
            id_ = whitespace.sub(id_whitespace_replacement, id_)

        desc = seq.description
        if description_newline_replacement is not None:
            desc = newline.sub(description_newline_replacement, desc)

        header = '%s %s' % (id_, desc) if desc else id_

        seq_str = str(seq)
        if max_width is not None:
            seq_str = _chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))
Esempio n. 10
0
def _fasta_to_sequence(fh, seq_num, constructor):
    """Return the `seq_num`-th (1-based) sequence read from `fh`.

    Parameters
    ----------
    fh : file-like
        Passed straight through to ``_fasta_to_generator``.
    seq_num : int
        1-based position of the sequence to return.
    constructor
        Forwarded to ``_fasta_to_generator`` as its ``constructor`` keyword.

    Returns
    -------
    The `seq_num`-th item produced by ``_fasta_to_generator``.

    Raises
    ------
    FASTAFormatError
        If `seq_num` is less than 1, or if fewer than `seq_num` sequences
        are produced.
    """
    if seq_num < 1:
        raise FASTAFormatError(
            "Invalid sequence number (seq_num=%d). seq_num must be between 1 "
            "and the number of sequences in the FASTA-formatted file "
            "(inclusive)." % seq_num)

    # Create the generator *before* entering ``try`` so that ``gen`` is
    # always bound when ``finally`` runs. If creation happened inside the
    # ``try`` and failed, the cleanup would raise UnboundLocalError and hide
    # the real error.
    gen = _fasta_to_generator(fh, constructor=constructor)

    seq_idx = seq_num - 1
    seq = None
    try:
        for idx, curr_seq in enumerate(gen):
            if idx == seq_idx:
                seq = curr_seq
                break
    finally:
        # Close the generator even when we stop early or an error occurs.
        gen.close()

    if seq is None:
        raise FASTAFormatError(
            "Reached end of FASTA-formatted file before finding %s biological "
            "sequence." % cardinal_to_ordinal(seq_num))
    return seq
Esempio n. 11
0
 def test_invalid_n(self):
     # A negative cardinal must raise ValueError with a message matching
     # the pattern '-1'.
     negative = -1
     with six.assertRaisesRegex(self, ValueError, '-1'):
         cardinal_to_ordinal(negative)
Esempio n. 12
0
 def test_invalid_n(self):
     # cardinal_to_ordinal(-1) is expected to fail with a ValueError whose
     # message matches the regex below.
     message_pattern = '-1'
     with six.assertRaisesRegex(self, ValueError, message_pattern):
         cardinal_to_ordinal(-1)
Esempio n. 13
0
 def test_invalid_n(self):
     # A negative cardinal must be rejected with a ValueError whose message
     # matches "-1".
     #
     # ``assertRaisesRegexp`` is a deprecated alias that was removed in
     # Python 3.12. Prefer ``assertRaisesRegex`` and fall back to the old
     # name only on interpreters that do not provide the new one.
     assert_raises_regex = getattr(self, "assertRaisesRegex", None)
     if assert_raises_regex is None:
         assert_raises_regex = self.assertRaisesRegexp
     with assert_raises_regex(ValueError, "-1"):
         cardinal_to_ordinal(-1)