Python FASTAの例、pypeline.common.formats.fasta.FASTA Pythonの例

コード例 #1

0

ファイルを表示

def test_fasta__from_file__compressed_bz2():
    # Parses a FASTA file whose name ends in ".bz2" and checks that both
    # records come back with the expected name, no meta text, and sequence.
    first = FASTA("This_is_BZ_FASTA!", None, "CGTNA")
    second = FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN")
    observed = list(FASTA.from_file("tests/data/fasta_file.fasta.bz2"))
    assert_equal(observed, [first, second])

コード例 #2

0

ファイルを表示

ファイル: phylip_test.py プロジェクト: health1987/paleomix

def test_sequentual_phy__different_length_names_2():
    msa = MSA([
        FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
        FASTA("Donkey", None, "TGCAGAGTACGACGT")
    ])
    expected = \
"""2 15

Burchelli_4             ACGTTGATAA  CCAGG
Donkey                  TGCAGAGTAC  GACGT"""
    print interleaved_phy(msa), expected
    assert_equal(interleaved_phy(msa), expected)

コード例 #3

0

ファイルを表示

def test_fasta__from_lines__multiple_records():
    # Three records: the first and third span two sequence lines each, and
    # the second header carries text after the first space ("XT:1:0") that
    # is expected to end up as the second FASTA() field.
    lines = [
        ">first\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
        ">Second XT:1:0\n",
        "GAGAGCTCAGCTAAC\n",
        ">Third\n",
        "CGCTGACCAAAAACGGACAG\n",
        "GGCATTCGGC\n",
    ]
    first = FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")
    second = FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC")
    third = FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC")
    observed = FASTA.from_lines(lines)
    assert_list_equal(observed, [first, second, third])

コード例 #4

0

ファイルを表示

ファイル: phylip_test.py プロジェクト: health1987/paleomix

def test_sequentual_phy__different_length_names_1():
    msa = MSA([
        FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
        FASTA("Another_really_long_sequence_name_that_is_too_long", None,
              "TGCAGAGTACGACGT")
    ])
    expected = \
"""2 15

A_short_name                        ACGTTGATAA  CCAGG
Another_really_long_sequence_n      TGCAGAGTAC  GACGT"""
    print interleaved_phy(msa), expected
    assert_equal(interleaved_phy(msa), expected)

コード例 #5

0

ファイルを表示

ファイル: msa.py プロジェクト: health1987/paleomix

    def filter_singletons(self, to_filter, filter_using):
        """Return a new MSA in which `to_filter` is restricted to nucleotides
        that are also observed in the `filter_using` records.

        For every column where the filtered record is called (neither "N"
        nor "-"), the nucleotides it encodes (via NT_CODES) are intersected
        with those encoded by the called bases of the records used for
        filtering. An empty intersection is replaced by "N". Whenever the
        re-encoded genotype differs from the (upper-cased) original base, it
        is written back in lower-case; unchanged positions are left as-is.

        The rewritten record is placed first in the returned MSA, followed
        by the two remaining groups returned by self._group.
        NOTE(review): the exact grouping semantics of self._group are not
        visible here -- confirm against its definition.
        """
        included, excluded, to_filter = self._group(filter_using, to_filter)

        uncalled = "N-"
        filtered = list(to_filter.sequence)
        references = [record.sequence.upper() for record in included]
        for (position, column) in enumerate(zip(*references)):
            observed = filtered[position].upper()
            if observed in uncalled:
                continue

            # Every nucleotide supported by at least one reference record
            supported = set()
            for nt in column:
                if nt in uncalled:
                    continue
                supported.update(NT_CODES[nt])

            shared = frozenset(NT_CODES[observed]) & supported
            genotype = encode_genotype(shared if shared else "N")
            if genotype != observed:
                # Lower-case marks positions modified by the filter
                filtered[position] = genotype.lower()

        record = FASTA(to_filter.name, to_filter.meta, "".join(filtered))
        return MSA([record] + included + excluded)

コード例 #6

0

ファイルを表示

ファイル: sample_pileup.py プロジェクト: health1987/paleomix

def main(argv):
    """Write one FASTA record per gene built from a pileup to stdout.

    Parses `argv`, opens the --genotype file with pysam.Tabixfile, reads
    the --intervals file grouped by contig, and for each contig (in sorted
    key order) writes every (name, sequence) pair produced by build_genes
    to sys.stdout as a FASTA record without meta information.

    Args:
        argv: List of command-line arguments, excluding the program name.

    Returns:
        0 once all records have been written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype",
                        help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    # argparse %-formats help strings against a dict of the argument's
    # attributes; optparse's "%default" is not valid there and breaks
    # --help, so the "%(default)s" form is required.
    parser.add_argument("--padding",
                        type=int,
                        default=10,
                        help="Number of bases to expand intervals, when "
                        "filtering based on adjacent indels [%(default)s]")
    parser.add_argument("--min-distance-to-indels",
                        type=int,
                        default=5,
                        help="Variants closer than this distance from indels "
                        "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0

コード例 #7

0

ファイルを表示

ファイル: msa.py プロジェクト: health1987/paleomix

 def from_lines(cls, lines):
     """Build a MSA from a file / list of lines in FASTA format.

     The lines are parsed with FASTA.from_lines, and the resulting records
     are wrapped in a MSA object. Records are expected to look as follows,
     with the text after the first space in the header being optional:
       >NAME META-INFORMATION
       SEQUENCE"""
     records = FASTA.from_lines(lines)
     return MSA(records)

コード例 #8

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__multiple_records():
    # Records "first" and "Third" are split over two sequence lines, which
    # are expected to be concatenated; "Second" has header text after the
    # name, expected as the second FASTA() field.
    lines = [
        ">first\n", "TGTTCTCCACCGTGCACAAC\n", "CCTTCATCCA\n",
        ">Second XT:1:0\n", "GAGAGCTCAGCTAAC\n",
        ">Third\n", "CGCTGACCAAAAACGGACAG\n", "GGCATTCGGC\n",
    ]
    expected = [
        FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
        FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC"),
        FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC"),
    ]
    parsed = FASTA.from_lines(lines)
    assert_list_equal(parsed, expected)

コード例 #9

0

ファイルを表示

ファイル: synthesize_reads.py プロジェクト: health1987/paleomix

    def __init__(self, options, filename):
        """Load the single FASTA record found in `filename`.

        The file must contain exactly one record; otherwise the assertion
        fails with the number of records found as its message. The sequence
        is stored upper-cased, the remaining attributes are reset to None,
        and self._mutate(options) is called last.
        """
        records = list(FASTA.from_file(filename))
        assert len(records) == 1, len(records)

        self._genome = records[0].sequence.upper()
        # Placeholders; presumably filled in by _mutate -- confirm there.
        self._sequence = self._positions = self._annotations = None

        self._mutate(options)

コード例 #10

0

ファイルを表示

ファイル: synthesize_reads.py プロジェクト: CarlesV/paleomix

    def __init__(self, options, filename):
        """Read a one-record FASTA file and initialise the instance state.

        An AssertionError carrying the record count is raised if the file
        does not contain exactly one record. The record's sequence is kept
        in upper-case as self._genome before self._mutate(options) runs.
        """
        parsed = list(FASTA.from_file(filename))
        assert len(parsed) == 1, len(parsed)
        only_record = parsed[0]

        self._genome = only_record.sequence.upper()
        self._sequence = None
        self._positions = None
        self._annotations = None
        self._mutate(options)

コード例 #11

0

ファイルを表示

ファイル: paml.py プロジェクト: CarlesV/paleomix

    def _setup(self, _config, temp):
        """Populate the `temp` folder with the three "template.*" files.

        - template.ctl: written by self._update_ctl_file from the control
          file.
        - template.trees: symlink to the absolute path of the trees file.
        - template.seqs: every record of the sequence file whose name is
          not in self._exclude_groups, upper-cased and without meta text.
        """
        def _in_temp(filename):
            return os.path.join(temp, filename)

        self._update_ctl_file(source=self._control_file,
                              destination=_in_temp("template.ctl"))

        os.symlink(os.path.abspath(self._trees_file),
                   _in_temp("template.trees"))
        with open(_in_temp("template.seqs"), "w") as handle:
            for record in FASTA.from_file(self._sequence_file):
                if record.name in self._exclude_groups:
                    continue

                cleaned = FASTA(record.name, None, record.sequence.upper())
                handle.write("%s\n" % (cleaned,))

コード例 #12

0

ファイルを表示

ファイル: sequences.py プロジェクト: KHanghoj/epiPALEOMIX

    def _run(self, _config, temp):
        """Write one "<sequence name>.fasta" file per sequence into `temp`.

        Each output file contains the named sequence fetched from every
        input FASTA file (in sorted order of the input names), with the
        input name as the record name and the sequence name as meta text.
        """
        fasta_files = [
            (name, pysam.Fastafile(filename))
            for (name, filename) in sorted(self._infiles.iteritems())
        ]

        for sequence_name in sorted(self._sequences):
            destination = os.path.join(temp, sequence_name + ".fasta")
            with open(destination, "w") as out_handle:
                for (sample, fasta_file) in fasta_files:
                    record = FASTA(sample,
                                   sequence_name,
                                   fasta_file.fetch(sequence_name))
                    out_handle.write(str(record))

コード例 #13

0

ファイルを表示

ファイル: msa.py プロジェクト: health1987/paleomix

    def reduce(self):
        """Return a copy of the MSA without fully uncalled columns.

        A column is dropped if it consists solely of "N", "n" and/or "-".
        Names and meta information are carried over to the new records.
        Returns None if no columns remain.
        """
        uncalled = frozenset("Nn-")
        sequences = (record.sequence for record in self)
        columns = [column for column in izip(*sequences)
                   if frozenset(column) - uncalled]
        if not columns:
            return None

        # Transpose the kept columns back into one sequence per record
        return MSA([FASTA(record.name, record.meta, "".join(nts))
                    for (record, nts) in izip(self, izip(*columns))])

コード例 #14

0

ファイルを表示

ファイル: msa.py プロジェクト: health1987/paleomix

    def join(cls, *msas):
        """Concatenate several MSAs into one, record by record.

        The MSAs are first checked with cls.validate. Sequences sharing a
        name are then joined in the order in which the MSAs were passed, so
        every MSA is expected to contain the same set of names. The merged
        records carry no meta information."""
        cls.validate(*msas)

        fragments = defaultdict(list)
        for msa in msas:
            for record in msa:
                fragments[record.name].append(record.sequence)

        return MSA([FASTA(name, None, "".join(parts))
                    for (name, parts) in fragments.iteritems()])

コード例 #15

0

ファイルを表示

ファイル: msa.py プロジェクト: health1987/paleomix

    def split(self, split_by="123"):
        """Partition the MSA, returning a dictionary mapping each key in
        'split_by' to a MSA built from the matching partition of every
        record. Meta information is not carried over to the new records.
        The per-record work is done by the module-level split function;
        see also pypeline.common.sequences.split.

        Raises TypeError if 'split_by' is empty."""
        self.validate(self)
        if not split_by:
            raise TypeError("No partitions to split by specified")

        partitions = dict((key, set()) for key in split_by)
        for record in self:
            pieces = split(record.sequence, split_by)
            for (key, partition) in pieces.iteritems():
                partitions[key].add(FASTA(record.name, None, partition))

        return dict((key, MSA(records))
                    for (key, records) in partitions.items())

コード例 #16

0

ファイルを表示

ファイル: sequences.py プロジェクト: KHanghoj/epiPALEOMIX

    def _run(self, _config, temp):
        """Collect one sequence per (contig, gene) and write them as FASTA.

        The BED records are read grouped by contig, sorted by contig, name
        and start, and grouped by name. Each group is passed on to
        self._collect_sequence together with the reference FASTA handle.
        The records are written to "sequences.fasta" in `temp`, sorted by
        (contig, gene), and the file is then moved to self._outfile.
        """
        fastafile = pysam.Fastafile(self._reference)
        sequences = {}
        with open(self._bedfile) as bedfile:
            by_contig = text.parse_lines_by_contig(bedfile, BEDRecord)
            for (contig, beds) in sorted(by_contig.iteritems()):
                # groupby only merges adjacent items, hence the sort by name
                beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

                grouped = itertools.groupby(beds, lambda bed: bed.name)
                for (gene, group) in grouped:
                    sequences[(contig, gene)] = \
                        self._collect_sequence(fastafile, tuple(group))

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(sequences.items()):
                FASTA(gene, None, sequence).write(out_file)

        fileutils.move_file(temp_file, self._outfile)

コード例 #17

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_file__compressed_bz2():
    # The ".bz2" test file is expected to yield exactly these two records,
    # in this order, with no meta information.
    expected = [
        FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
        FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN"),
    ]
    parsed = FASTA.from_file("tests/data/fasta_file.fasta.bz2")
    assert_equal(list(parsed), expected)

コード例 #18

0

ファイルを表示

def test_fasta__unimplemented_comparison():
    # Calling the rich-comparison methods directly with a non-FASTA operand
    # is expected to return the NotImplemented singleton.
    for method in ("__eq__", "__lt__", "__le__", "__ge__", "__gt__"):
        record = FASTA("A", None, "C")
        assert_is(NotImplemented, getattr(record, method)(10))

コード例 #19

0

ファイルを表示

def test_fasta__from_lines__empty_record_last():
    # The final header (">fasta2") is not followed by any sequence lines.
    # list() drains the parser output; the result itself is discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    records = FASTA.from_lines([">fasta1\n", "ACGT\n", ">fasta2\n"])
    list(records)

コード例 #20

0

ファイルを表示

def test_fasta__from_lines__empty_name__alone():
    # A single record whose header consists of ">" only (no name).
    # list() drains the parser output; the result itself is discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    records = FASTA.from_lines([">\n", "ACGT\n"])
    list(records)

コード例 #21

0

ファイルを表示

def test_fasta__from_lines__empty_name__with_others():
    # A nameless record (header is just ">") followed by a named record.
    # list() drains the parser output; the result itself is discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    records = FASTA.from_lines([">\n", "ACGT\n", ">Foo\n", "ACGGTA\n"])
    list(records)

コード例 #22

0

ファイルを表示

def test_fasta__repr__partial_line_test():
    # A sequence of a single fragment is expected on one line after the
    # header, terminated by a newline.
    record = FASTA("foobar", None, _SEQ_FRAG)
    observed = repr(record)
    assert_equal(observed, ">foobar\n%s\n" % (_SEQ_FRAG,))

コード例 #23

0

ファイルを表示

def test_fasta__sorting_less_equal():
    # A record must not be strictly less than an equal record, but must be
    # less than records that differ by a greater name, meta or sequence.
    base = ("A", "B", "C")
    greater = (("B", "B", "C"), ("A", "C", "C"), ("A", "B", "D"))

    assert not FASTA(*base) < FASTA(*base)
    for fields in greater:
        assert_less(FASTA(*base), FASTA(*fields))
    for fields in (base,) + greater:
        assert_less_equal(FASTA(*base), FASTA(*fields))

コード例 #24

0

ファイルを表示

def test_fasta__sorting_greater_equal():
    # A record must not be strictly greater than an equal record, while
    # records with a greater name, meta or sequence must compare greater.
    base = ("A", "B", "C")
    greater = (("B", "B", "C"), ("A", "C", "C"), ("A", "B", "D"))

    assert not FASTA(*base) > FASTA(*base)
    for fields in greater:
        assert_greater(FASTA(*fields), FASTA(*base))
    for fields in (base,) + greater:
        assert_greater_equal(FASTA(*fields), FASTA(*base))

コード例 #25

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines_single_record():
    # One record spread over two sequence lines, which are expected to be
    # concatenated into a single sequence.
    lines = [
        ">single\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
    ]
    expected = [FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")]
    parsed = FASTA.from_lines(lines)
    assert_list_equal(parsed, expected)

コード例 #26

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__no_records():
    # No input lines are expected to produce no records.
    parsed = FASTA.from_lines([])
    assert_list_equal(parsed, [])

コード例 #27

0

ファイルを表示

def test_fasta__repr__multiple_lines():
    # 15 fragments are expected to be wrapped as a line of 10 fragments
    # followed by a line of the remaining 5.
    first_line = _SEQ_FRAG * 10
    second_line = _SEQ_FRAG * 5
    expected = ">foobar\n%s\n%s\n" % (first_line, second_line)
    observed = repr(FASTA("foobar", None, _SEQ_FRAG * 15))
    assert_equal(observed, expected)

コード例 #28

0

ファイルを表示

def test_fasta__repr__complete_line_test():
    # 10 fragments are expected to fit on a single line, with no extra
    # (empty) line following it.
    sequence = _SEQ_FRAG * 10
    expected = ">barfoo\n%s\n" % (sequence,)
    observed = repr(FASTA("barfoo", None, _SEQ_FRAG * 10))
    assert_equal(observed, expected)

コード例 #29

0

ファイルを表示

def test_fasta__inequality():
    # Records differing in the sequence, the meta text or the name,
    # respectively, must not compare equal to the reference record.
    others = (("A", "B", "D"), ("A", None, "C"), ("D", "B", "C"))
    for fields in others:
        assert_not_equal(FASTA("A", "B", "C"), FASTA(*fields))

コード例 #30

0

ファイルを表示

def test_fasta__from_lines__empty_record_name_only__first():
    # The first header (">fasta1") is directly followed by another header,
    # i.e. the first record has no sequence lines.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    lines = [">fasta1\n", ">fasta2\n", "AGTC\n"]
    records = FASTA.from_lines(lines)
    list(records)

コード例 #31

0

ファイルを表示

def _simple_fasta_record():
    """Return a new FASTA record with a fixed name, meta text and sequence."""
    name, meta, sequence = "Dummy", "Meta-inf", "ACGT"
    return FASTA(name, meta, sequence)

コード例 #32

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__empty_record_name_only__first():
    # Record "fasta1" consists of a header only; "fasta2" is complete.
    # The parser output is drained with list() and then discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    header_only = [">fasta1\n"]
    complete = [">fasta2\n", "AGTC\n"]
    list(FASTA.from_lines(header_only + complete))

コード例 #33

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__empty_record__middle():
    # The middle record (">fasta1") has no sequence lines, while the
    # records before and after it are complete.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    records = FASTA.from_lines([
        ">fasta0\n", "ACGT\n",
        ">fasta1\n",
        ">fasta2\n", "AGTC\n",
    ])
    list(records)

コード例 #34

0

ファイルを表示

def test_fasta__equality():
    # Two separately constructed records with identical fields must
    # compare equal.
    left = FASTA("A", "B", "C")
    right = FASTA("A", "B", "C")
    assert_equal(left, right)

コード例 #35

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__empty_record_last():
    # A complete record followed by a trailing header without a sequence.
    # The parser output is drained with list() and then discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    complete = [">fasta1\n", "ACGT\n"]
    header_only = [">fasta2\n"]
    list(FASTA.from_lines(complete + header_only))

コード例 #36

0

ファイルを表示

ファイル: phylip_test.py プロジェクト: health1987/paleomix

from nose.tools import assert_equal

from pypeline.common.formats.phylip import \
     sequential_phy, \
     interleaved_phy

from pypeline.common.formats.msa import \
     MSA

from pypeline.common.formats.fasta import \
     FASTA



# Module-level alignments of two records each ("seq1" / "seq2"), differing
# in the length of the sequences, as indicated by the names.
_MSA_SHORT_SEQUENCES = MSA([
    FASTA("seq1", None, "ACGTTGATAACCAGG"),
    FASTA("seq2", None, "TGCAGAGTACGACGT"),
])
_MSA_MEDIUM_SEQUENCES = MSA([
    FASTA("seq1", None, "ACGTTGATAACCAGGAGGGATTCGCGATTGGTGGTAACGTAGCC"),
    FASTA("seq2", None, "TGCAGAGTACGACGTCTCCTAGATCCTGGACAATTTAAACCGAA"),
])
_MSA_LONG_SEQUENCES = MSA([
    FASTA("seq1", None,
          "CGGATCTGCTCCTCCACTGGCCACGTTTACTGTCCCCCAACCGTT"
          "CGTCCCGACCTAGTTATACTTCTTAGCAAGGTGTAAAACCAGAGATTGAGGTTATAACG"
          "TTCCTAATCAGTTATTAAATTACCGCGCCCCGACAG"),
    FASTA("seq2", None,
          "AGTTGAAGAGGCGGAACGTTTGTAAACCGCGCTAACGTAGTTCTA"
          "CAACCAGCCACCCGGTTCGAAGGAACAACTGGTCGCCATAATTAGGCGAAACGATAGTG"
          "CACTAAGGTCAGGTGCGCCCCTGTAAATAATTAGAT"),
])

# Short sequences paired with names longer than those used above.
_MSA_MEDIUM_NAMES = MSA([
    FASTA("A_really_long_sequence", None, "ACGTTGATAACCAGG"),
    FASTA("Another_real_long_one!", None, "TGCAGAGTACGACGT"),
])

コード例 #37

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__missing_name__alone():
    # A lone sequence line that is not preceded by any ">" header.
    # list() drains the parser output; the result itself is discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    records = FASTA.from_lines(["ACGT\n"])
    list(records)

コード例 #38

0

ファイルを表示

def test_fasta__from_lines__missing_name__alone():
    # The input starts with sequence data; there is no header line at all.
    # The parser output is drained with list() and then discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    headerless = ["ACGT\n"]
    parsed = FASTA.from_lines(headerless)
    list(parsed)

コード例 #39

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__empty_name__alone():
    # The only header is a bare ">" without any name following it.
    # The parser output is drained with list() and then discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    nameless = [">\n", "ACGT\n"]
    parsed = FASTA.from_lines(nameless)
    list(parsed)

コード例 #40

0

ファイルを表示

def test_fasta__from_lines__empty_record__middle():
    # ">fasta1" is immediately followed by the next header, so the second
    # of the three records has no sequence.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    before = [">fasta0\n", "ACGT\n"]
    empty = [">fasta1\n"]
    after = [">fasta2\n", "AGTC\n"]
    list(FASTA.from_lines(before + empty + after))

コード例 #41

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__empty_name__with_others():
    # The first header is a bare ">"; the second record ("Foo") is named.
    # The parser output is drained with list() and then discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    nameless = [">\n", "ACGT\n"]
    named = [">Foo\n", "ACGGTA\n"]
    list(FASTA.from_lines(nameless + named))

コード例 #42

0

ファイルを表示

ファイル: fasta_test.py プロジェクト: CarlesV/paleomix

def test_fasta__from_lines__empty_record_name_only__nothing_else():
    # The entire input is a single header line without any sequence.
    # list() drains the parser output; the result itself is discarded.
    # NOTE(review): presumably expected to raise -- confirm that an
    # expected-exception decorator accompanies this test.
    lines = [">fasta1\n"]
    records = FASTA.from_lines(lines)
    list(records)

コード例 #43

0

ファイルを表示

def test_fasta__hash():
    # Equal records must hash equally, whereas changing the name, the meta
    # text or the sequence is expected to change the hash.
    base = ("A", "B", "C")
    assert_equal(hash(FASTA(*base)), hash(FASTA(*base)))
    for fields in (("B", "B", "C"), ("A", "C", "C"), ("A", "B", "D")):
        assert_not_equal(hash(FASTA(*base)), hash(FASTA(*fields)))