Esempio n. 1
0
def _read_sequences(filenames):
    """Load one MSA per filename and validate the loaded MSAs together.

    Every file is read with MSA.from_file; all resulting alignments are
    then passed to MSA.validate in a single call. If a filename occurs more
    than once it is read each time, but only one entry is kept for it.

    Returns an iterator over (filename, MSA) pairs.
    """
    msa_by_filename = dict((fname, MSA.from_file(fname))
                           for fname in filenames)
    MSA.validate(*msa_by_filename.values())

    return msa_by_filename.iteritems()
Esempio n. 2
0
def _read_sequences(filenames):
    """Read the MSA stored in each file and validate them as a group.

    Returns an iterator over (filename, MSA) pairs; a filename listed
    several times is read each time but yields a single pair.
    """
    loaded = dict((path, MSA.from_file(path)) for path in filenames)
    # All alignments are handed to the validator in one call.
    MSA.validate(*loaded.values())

    return loaded.iteritems()
Esempio n. 3
0
def interleaved_phy(msa, add_flag = False, max_name_length = _MAX_NAME_LENGTH):
    """Return `msa` formatted as interleaved PHYLIP text.

    The first line holds the number of records and the sequence length,
    followed by " I" if `add_flag` is set. Records are emitted in sorted
    order; names are truncated to `max_name_length` characters. The first
    line of each record starts with the padded name, and the sequence is
    written in blocks of _BLOCK_SIZE characters separated by
    _BLOCK_SPACING spaces. Corresponding lines of all records are grouped
    together, with an empty line between groups.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        header += " I"

    longest_name = max(len(name) for name in msa.names())
    padded_len  = min(max_name_length, longest_name) + 2
    # Round up so that name + padding + one spacing is a whole number of
    # (block + spacing) units, i.e. the sequence starts on a block boundary.
    padded_len -= padded_len % -(_BLOCK_SIZE + _BLOCK_SPACING) + _BLOCK_SPACING

    spacing = " " * _BLOCK_SPACING
    streams = []
    for record in sorted(msa):
        name = record.name[:max_name_length]
        current = [name, " " * (padded_len - len(name))]

        finished = []
        for chars in grouper(_BLOCK_SIZE, record.sequence, fillvalue = ""):
            block = "".join(chars)
            if sum(map(len, current)) < _LINE_SIZE:
                current.extend((spacing, block))
            else:
                # Line is full; the block opens a new line instead
                finished.append("".join(current))
                current = [block]

        finished.append("".join(current))
        streams.append(finished)

    result = [header, ""]
    for rows in zip(*streams):
        result.extend(rows)
        result.append("")
    # Drop the trailing empty line added after the last group
    result.pop()

    return "\n".join(result)
Esempio n. 4
0
def test_msa_exclude__remove_one():
    """Excluding "B" from an MSA of "A" and "B" should leave only "A"."""
    kept    = FASTA("A", None, "ACGT")
    removed = FASTA("B", None, "GCTA")
    msa     = MSA([kept, removed])
    expected = MSA([kept])
    assert_equal(msa.exclude(["B"]), expected)
Esempio n. 5
0
 def _teardown(self, config, temp):
     """Check that the file written by MAFFT can be parsed as an MSA.

     The output file is looked up below `temp` via reroot_path and read
     with MSA.from_file; an MSAError is re-raised as a NodeError carrying
     the original error message. `config` is not used here.
     """
     # Validate output from MAFFT
     output_file = reroot_path(temp, self._output_file)
     try:
         MSA.from_file(output_file)
     # 'except ... as' is valid from Python 2.6 onwards, unlike the
     # comma form which is Python 2 only.
     except MSAError as error:
         raise NodeError("Invalid MSA produced by MAFFT:\n%s" % (error,))
Esempio n. 6
0
def test_msa_reduce__multiple_empty_column__all_empty_column_are_removed():
    """reduce() is expected to drop every column holding only "-"/"N"/"n"."""
    initial = MSA([FASTA("Name_A", "Meta_A", "-AnTN"),
                   FASTA("Name_B", "Meta_B", "NC-NN")])
    expected = MSA([FASTA("Name_A", "Meta_A", "AT"),
                    FASTA("Name_B", "Meta_B", "CN")])
    reduced = initial.reduce()
    assert_equal(reduced, expected)
Esempio n. 7
0
def test_msa_to_file__complete_line_test():
    """A 90 character sequence is expected as a 60 and a 30 character line."""
    records = [FASTA("barfoo", None, "ACGATA" * 10 + "CGATAG" * 5),
               FASTA("foobar", None, "CGAATG" * 10 + "TGTCAT" * 5)]
    msa = MSA(records)
    expected = "".join((
        ">barfoo\n%s\n%s\n" % ("ACGATA" * 10, "CGATAG" * 5),
        ">foobar\n%s\n%s\n" % ("CGAATG" * 10, "TGTCAT" * 5),
    ))
    handle = StringIO.StringIO()
    MSA.to_file(msa, handle)
    assert_equal(handle.getvalue(), expected)
Esempio n. 8
0
def test_msa_split_msa__two_groups():
    """Splitting by "112" should put two of every three columns in "1"."""
    msa = MSA([FASTA("seq1", None, "ACGCAT"),
               FASTA("seq2", None, "GAGTGA")])
    group_1 = MSA([FASTA("seq1", None, "ACCA"),
                   FASTA("seq2", None, "GATG")])
    group_2 = MSA([FASTA("seq1", None, "GT"),
                   FASTA("seq2", None, "GA")])
    expected = {"1" : group_1, "2" : group_2}
    assert_equal(msa.split("112"), expected)
Esempio n. 9
0
def test_msa_split__partial_group():
    """Splitting 5 columns by "123" should leave group "3" one column short."""
    msa = MSA([FASTA("seq1", None, "ACGCA"),
               FASTA("seq2", None, "GAGTG")])
    group_1 = MSA([FASTA("seq1", None, "AC"),
                   FASTA("seq2", None, "GT")])
    group_2 = MSA([FASTA("seq1", None, "CA"),
                   FASTA("seq2", None, "AG")])
    group_3 = MSA([FASTA("seq1", None, "G"),
                   FASTA("seq2", None, "G")])
    expected = {"1" : group_1, "2" : group_2, "3" : group_3}
    assert_equal(msa.split("123"), expected)
Esempio n. 10
0
def test_msa_split__empty_group():
    """Splitting 2 columns by "123" should yield an empty MSA for group "3"."""
    msa = MSA([FASTA("seq1", None, "AC"),
               FASTA("seq2", None, "GA")])
    group_1 = MSA([FASTA("seq1", None, "A"),
                   FASTA("seq2", None, "G")])
    group_2 = MSA([FASTA("seq1", None, "C"),
                   FASTA("seq2", None, "A")])
    group_3 = MSA([FASTA("seq1", None, ""),
                   FASTA("seq2", None, "")])
    expected = {"1" : group_1, "2" : group_2, "3" : group_3}
    assert_equal(msa.split("123"), expected)
Esempio n. 11
0
    def _run(self, _config, temp):
        """Write all input MSAs to one file as sequential PHYLIP blocks.

        Input files are read in sorted order, validated together, and each
        is formatted with sequential_phy; the blocks are written to
        self._out_phy (rerooted below `temp`), separated by an empty line.
        """
        # Read and check that MSAs share groups
        msas = []
        for filename in sorted(self.input_files):
            msas.append(MSA.from_file(filename))
        MSA.validate(*msas)

        blocks = [sequential_phy(msa, add_flag = self._add_flag)
                  for msa in msas]

        out_path = reroot_path(temp, self._out_phy)
        with open(out_path, "w") as output:
            output.write("\n\n".join(blocks))
Esempio n. 12
0
    def _run(self, _config, temp):
        """Concatenate the sequential PHYLIP form of every input MSA.

        The MSAs are loaded from the sorted input files and validated as a
        group; the formatted blocks are joined by an empty line and written
        to self._out_phy, rerooted below `temp`.
        """
        # Read and check that MSAs share groups
        msas = []
        for filename in sorted(self.input_files):
            msas.append(MSA.from_file(filename))
        MSA.validate(*msas)

        blocks = [
            sequential_phy(msa, add_flag=self._add_flag) for msa in msas
        ]

        destination = reroot_path(temp, self._out_phy)
        with open(destination, "w") as output:
            output.write("\n\n".join(blocks))
Esempio n. 13
0
def sequential_phy(msa, add_flag = False, max_name_length = _MAX_NAME_LENGTH):
    """Return `msa` formatted as sequential PHYLIP text.

    The first line holds the number of records and the sequence length,
    followed by " S" if `add_flag` is set, and is followed by an empty
    line. Each record (in sorted order) is written as its name, truncated
    to `max_name_length`, on a line of its own, followed by its sequence
    in lines of up to _NUM_BLOCKS blocks of _BLOCK_SIZE characters.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        header += " S"

    spacing = " " * _BLOCK_SPACING
    result = [header, ""]
    for record in sorted(msa):
        result.append(record.name[:max_name_length])

        blocks = grouper(_BLOCK_SIZE, record.sequence, fillvalue = "")
        for line in grouper(_NUM_BLOCKS, blocks):
            # Falsy entries (fill values padding the last line) are skipped
            chunks = ["".join(block) for block in line if block]
            result.append(spacing.join(chunks))

    return "\n".join(result)
Esempio n. 14
0
    def _run(self, _config, temp):
        """Filter singletons from the input MSA and write the result.

        filter_singletons is applied once per (to_filter, groups) entry of
        self._filter_by, each time on the result of the previous call. The
        filtered MSA is written below `temp` and then moved to
        self._output_file.
        """
        filtered = MSA.from_file(self._input_file)
        for filter_args in self._filter_by.iteritems():
            to_filter, groups = filter_args
            filtered = filtered.filter_singletons(to_filter, groups)

        temp_path = fileutils.reroot_path(temp, self._output_file)
        with open(temp_path, "w") as out_handle:
            filtered.to_file(out_handle)
        fileutils.move_file(temp_path, self._output_file)
Esempio n. 15
0
    def _run(self, _config, temp):
        """Apply every configured singleton filter and save the alignment.

        Each (to_filter, groups) pair in self._filter_by is passed to
        filter_singletons in turn. The result is first written to a file
        below `temp`, which is then moved to self._output_file.
        """
        msa = MSA.from_file(self._input_file)
        for pair in self._filter_by.iteritems():
            to_filter, groups = pair
            msa = msa.filter_singletons(to_filter, groups)

        staging_path = fileutils.reroot_path(temp, self._output_file)
        with open(staging_path, "w") as staging:
            msa.to_file(staging)
        fileutils.move_file(staging_path, self._output_file)
Esempio n. 16
0
def sequential_phy(msa, add_flag=False, max_name_length=_MAX_NAME_LENGTH):
    """Format `msa` as sequential PHYLIP text and return it as a string.

    Output starts with "<records> <sequence length>" (plus " S" when
    `add_flag` is set) and an empty line. For every record, in sorted
    order, the name (cut to `max_name_length` characters) is written on
    its own line, followed by the sequence split into lines of up to
    _NUM_BLOCKS blocks, each block holding _BLOCK_SIZE characters.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        header += " S"

    spacing = " " * _BLOCK_SPACING
    result = [header, ""]
    for record in sorted(msa):
        result.append(record.name[:max_name_length])

        blocks = grouper(_BLOCK_SIZE, record.sequence, fillvalue="")
        for line in grouper(_NUM_BLOCKS, blocks):
            # Skip falsy entries, i.e. fill values padding the final line
            pieces = ["".join(block) for block in line if block]
            result.append(spacing.join(pieces))

    return "\n".join(result)
Esempio n. 17
0
def test_sequentual_phy__different_length_names_2():
    """Names of different lengths should be padded to a common column.

    Note that, despite the name of this test, it exercises interleaved_phy.
    """
    msa = MSA([
        FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
        FASTA("Donkey", None, "TGCAGAGTACGACGT")
    ])
    expected = \
"""2 15

Burchelli_4             ACGTTGATAA  CCAGG
Donkey                  TGCAGAGTAC  GACGT"""
    # assert_equal reports both values on failure, so no debug output is
    # needed here.
    assert_equal(interleaved_phy(msa), expected)
Esempio n. 18
0
def _is_sufficently_covered(filepath, min_coverage):
    """Return True if the MSA in `filepath` is covered well enough.

    The alignment is rejected if its length is not a multiple of 3, or if
    it contains no bases at all (no records, or zero-length sequences).
    Otherwise, the fraction of positions that are neither "N"/"n" nor "-"
    is computed across all records and compared against `min_coverage`.
    """
    msa = MSA.from_file(filepath)
    seqlen = msa.seqlen()
    if seqlen % 3:
        return False

    total_bases = len(msa) * seqlen
    if not total_bases:
        # Nothing is covered; also avoids a ZeroDivisionError below
        return False

    total_bases_not_covered = 0
    for fasta_record in msa:
        sequence = fasta_record.sequence
        total_bases_not_covered += sequence.upper().count("N")
        total_bases_not_covered += sequence.count("-")

    # float() forces true division under Python 2
    frac_covered = 1.0 - total_bases_not_covered / float(total_bases)
    return frac_covered >= min_coverage
Esempio n. 19
0
def test_sequentual_phy__different_length_names_1():
    """An over-long name should be truncated, a short name padded to match.

    Note that, despite the name of this test, it exercises interleaved_phy.
    """
    msa = MSA([
        FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
        FASTA("Another_really_long_sequence_name_that_is_too_long", None,
              "TGCAGAGTACGACGT")
    ])
    expected = \
"""2 15

A_short_name                        ACGTTGATAA  CCAGG
Another_really_long_sequence_n      TGCAGAGTAC  GACGT"""
    # assert_equal reports both values on failure, so no debug output is
    # needed here.
    assert_equal(interleaved_phy(msa), expected)
def _is_sufficently_covered(filepath, min_coverage):
    """Return True if the MSA in `filepath` is covered well enough.

    The alignment is rejected if its length is not a multiple of 3, or if
    it contains no bases at all (no records, or zero-length sequences).
    Otherwise, the fraction of positions that are neither "N"/"n" nor "-"
    is computed across all records and compared against `min_coverage`.
    """
    msa = MSA.from_file(filepath)
    seqlen = msa.seqlen()
    if seqlen % 3:
        return False

    total_bases = len(msa) * seqlen
    if not total_bases:
        # Nothing is covered; also avoids a ZeroDivisionError below
        return False

    total_bases_not_covered = 0
    for fasta_record in msa:
        sequence = fasta_record.sequence
        total_bases_not_covered += sequence.upper().count("N")
        total_bases_not_covered += sequence.count("-")

    # float() forces true division under Python 2
    frac_covered = 1.0 - total_bases_not_covered / float(total_bases)
    return frac_covered >= min_coverage
Esempio n. 21
0
    def _run(self, _config, temp):
        """Write a merged interleaved PHYLIP file plus a partitions file.

        For every entry in self._infiles (in sorted order), each file is
        read, optionally stripped of self._excluded, and split by the
        entry's partition string. The parts are joined per partition key
        (key "X" is discarded), optionally reduced, and collected under
        the label "<name>_<key>". All collected MSAs are joined and written
        to "<out_prefix>.phy"; their coordinate ranges are written to
        "<out_prefix>.partitions". Both files are placed below `temp`.
        """
        merged_msas = []
        for (name, files_dd) in sorted(self._infiles.iteritems()):
            partitions = files_dd["partitions"]
            parts_by_key = dict((key, []) for key in partitions)
            for filename in files_dd["filenames"]:
                alignment = MSA.from_file(filename)
                if self._excluded:
                    alignment = alignment.exclude(self._excluded)

                split_parts = alignment.split(partitions)
                for (key, part) in split_parts.iteritems():
                    parts_by_key[key].append(part)

            # Columns assigned to "X" are not written to the output
            parts_by_key.pop("X", None)
            for (key, parts) in sorted(parts_by_key.iteritems()):
                joined = MSA.join(*parts)
                if self._reduce:
                    joined = joined.reduce()

                # Nothing is recorded for a partition when the MSA is None
                if joined is None:
                    continue
                merged_msas.append(("%s_%s" % (name, key), joined))

        phy_path = reroot_path(temp, self._out_prefix + ".phy")
        with open(phy_path, "w") as phy_handle:
            combined = MSA.join(*(msa for (_, msa) in merged_msas))
            phy_handle.write(interleaved_phy(combined))

        parts_path = reroot_path(temp, self._out_prefix + ".partitions")
        with open(parts_path, "w") as parts_handle:
            # 1-based, inclusive coordinates into the joined alignment
            offset = 0
            for (label, msa) in merged_msas:
                length = msa.seqlen()
                parts_handle.write("DNA, %s = %i-%i\n"
                                   % (label, offset + 1, offset + length))
                offset += length
Esempio n. 22
0
def interleaved_phy(msa, add_flag=False, max_name_length=_MAX_NAME_LENGTH):
    """Format `msa` as interleaved PHYLIP text and return it as a string.

    Output starts with "<records> <sequence length>" (plus " I" when
    `add_flag` is set). Records are processed in sorted order and names
    are cut to `max_name_length` characters. Each record's first line
    begins with its padded name; sequences are written as blocks of
    _BLOCK_SIZE characters separated by _BLOCK_SPACING spaces. The n'th
    lines of all records are emitted together, and groups of lines are
    separated by an empty line.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        header += " I"

    longest_name = max(len(name) for name in msa.names())
    padded_len = min(max_name_length, longest_name) + 2
    # Round up so that name + padding + one spacing is a whole number of
    # (block + spacing) units, i.e. the sequence starts on a block boundary.
    padded_len -= padded_len % -(_BLOCK_SIZE + _BLOCK_SPACING) + _BLOCK_SPACING

    spacing = " " * _BLOCK_SPACING
    streams = []
    for record in sorted(msa):
        name = record.name[:max_name_length]
        pending = [name, " " * (padded_len - len(name))]

        completed = []
        for chars in grouper(_BLOCK_SIZE, record.sequence, fillvalue=""):
            block = "".join(chars)
            if sum(map(len, pending)) < _LINE_SIZE:
                pending.extend((spacing, block))
            else:
                # Line is full; the block opens a new line instead
                completed.append("".join(pending))
                pending = [block]

        completed.append("".join(pending))
        streams.append(completed)

    result = [header, ""]
    for rows in zip(*streams):
        result.extend(rows)
        result.append("")
    # Drop the trailing empty line added after the last group
    result.pop()

    return "\n".join(result)
Esempio n. 23
0
    def _run(self, _config, temp):
        """Merge partitioned MSAs into one PHYLIP file and list partitions.

        Every file of every entry in self._infiles (entries in sorted
        order) is loaded, has self._excluded removed if set, and is split
        using the entry's partition string. Parts sharing a key are joined
        (key "X" is dropped), reduced if self._reduce is set, and stored as
        "<name>_<key>". The joined result is written in interleaved format
        to "<out_prefix>.phy", and the range of each stored MSA is written
        to "<out_prefix>.partitions"; both are placed below `temp`.
        """
        merged_msas = []
        for (name, files_dd) in sorted(self._infiles.iteritems()):
            partitions = files_dd["partitions"]
            collected = dict((key, []) for key in partitions)
            for filename in files_dd["filenames"]:
                current = MSA.from_file(filename)
                if self._excluded:
                    current = current.exclude(self._excluded)

                pieces = current.split(partitions)
                for (key, piece) in pieces.iteritems():
                    collected[key].append(piece)

            # Columns assigned to "X" are not written to the output
            collected.pop("X", None)
            for (key, pieces) in sorted(collected.iteritems()):
                merged = MSA.join(*pieces)
                if self._reduce:
                    merged = merged.reduce()

                # Nothing is recorded for a partition when the MSA is None
                if merged is None:
                    continue
                merged_msas.append(("%s_%s" % (name, key), merged))

        phy_filename = reroot_path(temp, self._out_prefix + ".phy")
        with open(phy_filename, "w") as phy_file:
            final_msa = MSA.join(*(msa for (_, msa) in merged_msas))
            phy_file.write(interleaved_phy(final_msa))

        parts_filename = reroot_path(temp, self._out_prefix + ".partitions")
        with open(parts_filename, "w") as parts_file:
            # 1-based, inclusive coordinates into the joined alignment
            last_end = 0
            for (label, msa) in merged_msas:
                length = msa.seqlen()
                first, last = last_end + 1, last_end + length
                parts_file.write("DNA, %s = %i-%i\n" % (label, first, last))
                last_end += length
Esempio n. 24
0
from nose.tools import assert_equal

from pypeline.common.formats.phylip import \
     sequential_phy, \
     interleaved_phy

from pypeline.common.formats.msa import \
     MSA

from pypeline.common.formats.fasta import \
     FASTA



# Fixtures: two records named "seq1"/"seq2" with sequences of increasing
# length.
_MSA_SHORT_SEQUENCES = MSA([
    FASTA("seq1", None, "ACGTTGATAACCAGG"),
    FASTA("seq2", None, "TGCAGAGTACGACGT"),
])
_MSA_MEDIUM_SEQUENCES = MSA([
    FASTA("seq1", None, "ACGTTGATAACCAGGAGGGATTCGCGATTGGTGGTAACGTAGCC"),
    FASTA("seq2", None, "TGCAGAGTACGACGTCTCCTAGATCCTGGACAATTTAAACCGAA"),
])
_MSA_LONG_SEQUENCES = MSA([
    FASTA("seq1", None,
          "CGGATCTGCTCCTCCACTGGCCACGTTTACTGTCCCCCAACCGTT"
          "CGTCCCGACCTAGTTATACTTCTTAGCAAGGTGTAAAACCAGAGATTGAGGTTATAACG"
          "TTCCTAATCAGTTATTAAATTACCGCGCCCCGACAG"),
    FASTA("seq2", None,
          "AGTTGAAGAGGCGGAACGTTTGTAAACCGCGCTAACGTAGTTCTA"
          "CAACCAGCCACCCGGTTCGAAGGAACAACTGGTCGCCATAATTAGGCGAAACGATAGTG"
          "CACTAAGGTCAGGTGCGCCCCTGTAAATAATTAGAT"),
])

# Fixture: the short sequences, but with longer record names.
_MSA_MEDIUM_NAMES = MSA([
    FASTA("A_really_long_sequence", None, "ACGTTGATAACCAGG"),
    FASTA("Another_real_long_one!", None, "TGCAGAGTACGACGT"),
])
_MSA_LONG_NAMES = \
Esempio n. 25
0
def test_msa_join__three_msa():
    """Joining the three fixture MSAs should yield the 9 column MSA below."""
    records = (FASTA("nc",    None, "ACGTGAAAG"),
               FASTA("nm",    None, "TGACTTGAG"),
               FASTA("miRNA", None, "UCAGACCAU"))
    expected = MSA(records)
    assert_equal(MSA.join(_JOIN_MSA_1, _JOIN_MSA_2, _JOIN_MSA_3), expected)
Esempio n. 26
0
def test_msa_join__missing_arguments():
    # Calls MSA.join without any MSAs; the body itself asserts nothing.
    # NOTE(review): presumably an expected-exception decorator belongs
    # above this function but is not part of the visible code - confirm.
    MSA.join()
Esempio n. 27
0
def test_msa_split_msa__single_group():
    """Splitting by "111" should return one group equal to the input MSA."""
    records = [FASTA("seq1", None, "ACGCAT"),
               FASTA("seq2", None, "GAGTGA")]
    msa = MSA(records)
    # Compare against a copy rather than the very same object
    expected = {'1' : copy.copy(msa)}
    observed = msa.split("111")
    assert_equal(observed, expected)
Esempio n. 28
0
def test_msa_reduce__only_empty_column__none_is_returned():
    """reduce() should return None if every column holds only "-"/"N"/"n"."""
    records = [FASTA("Name_A", "Meta_A", "---Nn"),
               FASTA("Name_B", "Meta_B", "Nn--N")]
    reduced = MSA(records).reduce()
    assert_equal(reduced, None)
Esempio n. 29
0
def test_msa_from_lines__two_entries_with_meta():
    """Meta information should be parsed for the record that has it."""
    lines = [">seq1", "ACG", ">seq2 Second meta", "TGA"]
    first  = FASTA("seq1", None, "ACG")
    second = FASTA("seq2", "Second meta", "TGA")
    expected = MSA([first, second])
    assert_equal(MSA.from_lines(lines), expected)
Esempio n. 30
0
def test_msa_join__two_msa():
    """Joining the first two fixture MSAs should yield the 6 column MSA."""
    records = (FASTA("nc",    None, "ACGTGA"),
               FASTA("nm",    None, "TGACTT"),
               FASTA("miRNA", None, "UCAGAC"))
    expected = MSA(records)
    assert_equal(MSA.join(_JOIN_MSA_1, _JOIN_MSA_2), expected)
Esempio n. 31
0
def test_msa_from_lines__duplicate_names():
    # Parses two records that share the name "seq1"; nothing is asserted.
    # NOTE(review): presumably an expected-exception decorator belongs
    # above this function but is not part of the visible code - confirm.
    MSA.from_lines([">seq1", "ACG", ">seq1", "TGA"])
Esempio n. 32
0
    def _run(self, _config, temp):
        """Join all input MSAs and write them as one interleaved PHYLIP file.

        Input files are read in sorted order and joined with MSA.join; the
        result is written to self._out_phy, rerooted below `temp`.
        """
        filenames = sorted(self.input_files)
        joined = MSA.join(*(MSA.from_file(filename) for filename in filenames))

        out_path = reroot_path(temp, self._out_phy)
        with open(out_path, "w") as output:
            phy_text = interleaved_phy(joined, add_flag = self._add_flag)
            output.write(phy_text)
Esempio n. 33
0
def test_msa_from_lines__mismatched_lengths():
    # Parses records with sequences of length 3 and 4; nothing is asserted.
    # NOTE(review): presumably an expected-exception decorator belongs
    # above this function but is not part of the visible code - confirm.
    MSA.from_lines([">seq1", "ACG", ">seq2", "TGAN"])
Esempio n. 34
0
def test_msa_from_lines__empty_name():
    # Parses a first record whose header is a bare ">"; nothing is asserted.
    # NOTE(review): presumably an expected-exception decorator belongs
    # above this function but is not part of the visible code - confirm.
    MSA.from_lines([">", "ACG", ">seq1", "TGAN"])
Esempio n. 35
0
def test_msa_from_file__compressed_bz2():
    """MSA.from_file should be able to read the ".bz2" test file."""
    records = [FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
               FASTA("This_is_ALSO_BZ_FASTA!", None,  "ACGTN")]
    expected = MSA(records)
    observed = MSA.from_file("tests/data/fasta_file.fasta.bz2")
    assert_equal(observed, expected)
Esempio n. 36
0
def test_msa_join__single_msa():
    """Joining a single MSA should yield an MSA equal to the input."""
    assert_equal(MSA.join(_JOIN_MSA_1), _JOIN_MSA_1)
Esempio n. 37
0
    def _run(self, _config, temp):
        """Write the join of all input MSAs in interleaved PHYLIP format.

        The MSAs are loaded from the sorted input files and joined; the
        output goes to self._out_phy, rerooted below `temp`.
        """
        filenames = sorted(self.input_files)
        combined = MSA.join(*(MSA.from_file(fname) for fname in filenames))

        destination = reroot_path(temp, self._out_phy)
        with open(destination, "w") as output:
            text = interleaved_phy(combined, add_flag=self._add_flag)
            output.write(text)
Esempio n. 38
0
def test_msa_from_lines__single_entry_with_meta():
    """Text after the name in the header should be parsed as meta info."""
    lines = [">seq1 Meta info", "ACG"]
    record = FASTA("seq1", "Meta info", "ACG")
    expected = MSA([record])
    assert_equal(MSA.from_lines(lines), expected)
Esempio n. 39
0
def test_msa_from_lines__single_entry():
    """A header without meta information should yield None as meta."""
    lines = [">seq1", "ACG"]
    expected = MSA([FASTA("seq1", None, "ACG")])
    observed = MSA.from_lines(lines)
    assert_equal(observed, expected)
Esempio n. 40
0
def test_msa__seqlen__corresponds_to_sequence_lengths():
    """seqlen() should equal the length of the (17 bp) record sequences."""
    records = (FASTA("seq1",    None, "ACGCGTATGCATGCCGA"),
               FASTA("seq2",    None, "TGAACACACAGTAGGAT"))
    observed = MSA(records).seqlen()
    assert_equal(observed, 17)
Esempio n. 41
0
def test_msa_split_msa__no_split_by():
    """Call split() with an empty `split_by`; the body asserts nothing."""
    # NOTE(review): presumably an expected-exception decorator belongs
    # above this function but is not part of the visible code - confirm.
    records = [FASTA("seq1", None, "ACG"),
               FASTA("seq2", None, "GAT")]
    alignment = MSA(records)
    alignment.split(split_by = "")