Example #1
0
__author__ = "pmoreno"
__date__ = "$May 29, 2011 5:16:28 PM$"

if __name__ == "__main__":
    # Script entry point.
    # argv: <alignment without general signal> <clade alignment>
    #       <1-based index of the entry to test> <result folder>
    #
    # Imports stay local to the entry point, as in the original script.
    # Seq and SeqRecord are used below, so they are imported here as well.
    import sys

    from Bio import AlignIO, SeqIO
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Align import MultipleSeqAlignment
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    #dirOfHMMModels = sys.argv[1]
    fastaFileCladeNoGeneralSignal = sys.argv[1]
    fastaFileClade = sys.argv[2]
    entryToTest = int(sys.argv[3])
    resultFolder = sys.argv[4]

    alignmentNoGenSignalIterator = AlignIO.parse(
        fastaFileCladeNoGeneralSignal, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    alignmentIterator = AlignIO.parse(
        fastaFileClade, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))

    # Only the first alignment of each file is used. The builtin next()
    # works on both Python 2.6+ and Python 3, unlike iterator.next().
    noGenSignalAlignment = next(alignmentNoGenSignalIterator)
    queryFasta = "%s/Query_%d.faa" % (resultFolder, entryToTest)
    ownCladeProfile = "%s/ForOwnCladeProfile_%d.faa" % (resultFolder, entryToTest)
    #print testAlignment[entryToTest].id
    #print testAlignment[entryToTest].seq

    alignmentWithSignal = next(alignmentIterator)
    # entryToTest is 1-based on the command line; alignment rows are 0-based.
    desiredRecord = alignmentWithSignal[entryToTest - 1]
    # Strip the alignment gaps to recover the raw query sequence.
    desiredSeqString = str(desiredRecord.seq).replace("-", "")
    #print desiredSeqString
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    #print seqNoGaps
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=desiredRecord.id)
    def quantitative_analyzes(self, region):
        """
        Score every path of every analysed patient's tree for one region.

        For each patient in ``self.patients_list`` (except 'p3' and 'p10') the
        reference and the patient's data for ``region`` are concatenated and
        sorted by 'days', a ``tree_building.Tree`` is built from them, and each
        path returned by the graph is converted with
        ``self.clf_metric_2_mer_path`` and passed to
        ``self.classificator.predict_proba``.

        Nothing is returned; results are stored on the instance:
            self.patients_evolution: dict of dicts ->
                {patient: {'X': sorted unique days,
                           'Y': [column 1 of predict_proba, one entry per path]}}

        If ``region`` is listed in ``self.broken_regions`` only a message is
        printed and ``self.patients_evolution`` is left untouched.
        """
        if region in self.broken_regions:
            print('There is no data for this region. Please choose other one or consider haplotype calling for this region')
            return

        # Fresh result container for this run
        self.patients_evolution = {}

        # 2-mers used by the path metric
        two_mers = data_prep_k_mer.making_aa_k_mers(2)

        # Reference provider shared by all patients
        reference = patients_data.Reference('data/hivevo')

        # Patients p3 and p10 are deliberately left out
        # (background: https://elifesciences.org/articles/11282)
        excluded_patients = ('p3', 'p10')

        for patient in self.patients_list:
            if patient in excluded_patients:
                continue

            # Patient samples for the region, with the reference added,
            # ordered by day -> input for the tree
            patient_samples = patients_data.Patient(patient).regions[region]
            patient_reference = reference.get_patient(patient, region=region)
            timeline = pd.concat(
                [patient_reference, patient_samples], ignore_index=True
            ).sort_values(by=['days'])

            tree = tree_building.Tree(timeline)
            tree.build()

            # tree.mapping keys are indexed with [1] to get the sequence;
            # the mapped value is used as the protein id
            seq_ids = tree.mapping
            proteins = {
                seq_ids[key]: Seq(key[1], Gapped(IUPAC.unambiguous_dna)).ungap().translate()
                for key in seq_ids.keys()
            }

            # Graph over one vertex per mapped sequence, weighted by the tree edges
            vertex_ids = list(range(len(tree.mapping)))
            weighted_edges = tree.graph
            g = graph.Graph()
            g.add_vertices(vertex_ids)
            g.set_edge_weights(weighted_edges)
            all_paths = g.all_paths()

            # X axis: the distinct days, ascending
            unique_days = sorted({day for day, _ in tree.mapping.keys()})

            # Y axis: one classifier output per path, appended as it is computed
            per_path_probabilities = []
            self.patients_evolution[patient] = {
                'X': unique_days,
                'Y': per_path_probabilities,
            }
            for path in all_paths:
                metrics = self.clf_metric_2_mer_path(path, proteins, two_mers)
                per_path_probabilities.append(
                    self.classificator.predict_proba(metrics)[:, 1]
                )
Example #3
0
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect")
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        if tool in ["TFASTX"] and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        if len(q) != len(m):
            raise ValueError(f"""\
Darn... amino acids vs nucleotide coordinates?
tool: {tool}
query_seq: {query_seq}
query_tags: {query_tags}
{q} length: {len(q)}
match_seq: {match_seq}
match_tags: {match_tags}
{m} length: {len(m)}
handle.name: {handle.name}
""")

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # See also Bio/AlignIO/MafIO.py for same requirement.
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])},
        )
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])},
        )
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
    def align_progressive_nj(self, match_score = 1, mismatch_penalty = -1, gap_penalty = -1, extension_penalty = -1, filename = "output.txt"):
        """Progressively merge the closest pair of nodes into one alignment.

        Every sequence in ``self.sequences`` starts as its own ``Node``.
        While more than one node remains, the pair with the smallest
        pairwise distance is merged with ``merge_nodes`` and the distance
        matrix is recomputed for the reduced node list.

        Args:
            match_score, mismatch_penalty, gap_penalty, extension_penalty:
                scoring passed to ``pairwise2.align.globalms`` when aligning
                two node consensus sequences.
            filename: passed to ``save_msa_to_file`` for the final alignment.

        Returns:
            Whatever ``save_msa_to_file`` returns for the final alignment.

        A single input sequence is now handled (the closest-pair lookup used
        to raise IndexError on the 1x1 matrix). An empty ``self.sequences``
        still fails when the final node is accessed.
        """
        calculator = DistanceCalculator('blosum62')
        gapped_protein = Gapped(IUPAC.extended_protein, "-")

        def pairwise_distances(nodes):
            """Return the symmetric matrix of distances between node consensuses."""
            size = len(nodes)
            matrix = np.zeros((size, size))
            for i, j in combinations(range(size), 2):
                alignment = pairwise2.align.globalms(nodes[i].consensus,
                                                     nodes[j].consensus,
                                                     match_score, mismatch_penalty, gap_penalty,
                                                     extension_penalty, one_alignment_only=True)[0]
                aln = MultipleSeqAlignment(
                    [SeqIO.SeqRecord(Seq(alignment[0], gapped_protein), id="0"),
                     SeqIO.SeqRecord(Seq(alignment[1], gapped_protein), id="1")],
                    gapped_protein)
                dm = calculator.get_distance(aln)
                matrix[i][j] = matrix[j][i] = dm[0][1]
            return matrix

        def closest_pair(matrix):
            """Return the first (i, j), i < j, with the smallest distance."""
            # min() keeps the first minimal element, matching the original
            # strict "<" scan that started from (0, 1).
            return min(combinations(range(len(matrix)), 2),
                       key=lambda pair: matrix[pair[0]][pair[1]])

        nodes_list = [Node([str(seq[1])]) for seq in self.sequences]
        distance_matrix = pairwise_distances(nodes_list)

        # Diagnostic output; a closest pair only exists for two or more nodes.
        if len(nodes_list) > 1:
            argmin = closest_pair(distance_matrix)
            print("ARGMIN, MIN", argmin, distance_matrix[argmin[0]][argmin[1]])

        print(distance_matrix)
        while len(nodes_list) > 1:
            first, second = closest_pair(distance_matrix)
            newnode = merge_nodes(nodes_list[first], nodes_list[second])
            # Drop the two merged nodes and put the merged node at the end.
            nodes_list = [node for index, node in enumerate(nodes_list)
                          if index not in (first, second)]
            nodes_list.append(newnode)

            distance_matrix = pairwise_distances(nodes_list)

        print("ALIGNMENT:")
        for x in nodes_list[0].msa:
            print(str(x))
        score = save_msa_to_file(nodes_list[0].msa, filename)
        return score
Example #5
0
#!/usr/bin/env python
"""Example of generating a substitution matrix from an alignment."""
# standard library
from __future__ import print_function

# Biopython
from Bio import SubsMat
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
from Bio.Align import AlignInfo

# Load the ClustalW output as a gapped protein alignment and summarise it.
c_align = AlignIO.read('protein.aln', 'clustal', alphabet=Gapped(IUPAC.protein))
summary_align = AlignInfo.SummaryInfo(c_align)

# Letters handed to replacement_dictionary() as the ones to skip
# (original note: exclude all amino acids that aren't charged polar).
skipped_residues = list("GAVLIMPFWSTNQYC")
replace_info = summary_align.replacement_dictionary(skipped_residues)

# Accepted replacement matrix built from the replacement counts.
my_arm = SubsMat.SeqMat(replace_info)

print(replace_info)

# Log-odds matrix derived from the accepted replacement matrix.
my_lom = SubsMat.make_log_odds_matrix(my_arm)

print('log_odds_mat: %s' % my_lom)
Example #6
0
#http://biopython.org/wiki/AlignIO

#!/usr/bin/env python

from Bio import SeqIO

import os
import sys
from collections import defaultdict
from pprint import pprint
import argparse
import multiprocessing
from Bio.Alphabet import generic_dna, Gapped
from Bio import AlignIO

# Convert the FASTA alignment named by argv[1] into a NEXUS file at argv[2].
# Both files are opened with ``with`` so they are closed (and the output is
# flushed) even when parsing or writing fails.
with open(sys.argv[1]) as fasta_handle:
    alignment = AlignIO.read(fasta_handle,
                             'fasta',
                             alphabet=Gapped(generic_dna))

with open(sys.argv[2], 'w') as output:
    AlignIO.write(alignment, output, "nexus")
Example #7
0
__author__ = 'amirbar'

import os
from Bio.Seq import Seq
from Bio import motifs
from Bio.Alphabet import Gapped, IUPAC
import matplotlib.pyplot as plt
import optparse
import sys
import os

GAP = "-"
ALPHABET = Gapped(IUPAC.unambiguous_dna)


def process_command_line(argv):
    """
	Return a 2-tuple: (settings object, args list).
	`argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
	"""
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = optparse.OptionParser(
        formatter=optparse.TitledHelpFormatter(width=100),
        add_help_option=None)

    parser.add_option(
        "-r",
        "--reads_fusion",
    for line in f:
        MSAfilename = line.replace("\n", '')
        MSAfilenames.append(
            MSAfilename)  # get names of MSA files without format

# Convert every FASTA alignment to NEXUS, plus a re-labelled FASTA copy.
for MSAfilename in MSAfilenames:
    relabelled_base = "speciesID_" + MSAfilename
    file_list.append(relabelled_base + ".nex")  # remember the NEXUS file name
    # Plain "r" replaces the former "rU": the "U" flag was deprecated and is
    # rejected with ValueError since Python 3.11; text mode on Python 3
    # already uses universal newlines.
    with open(MSAfilename + ".fa", "r") as input_handle, \
            open(relabelled_base + ".nex", "w") as output_handle_nex, \
            open(relabelled_base + ".fa", "w") as output_handle_fasta:
        alignments = AlignIO.read(
            input_handle, "fasta",
            alphabet=Gapped(IUPAC.protein))  # read fasta file to "alignments"
        for seq in alignments:
            # Use the text after the last "[" (minus its final character,
            # presumably the closing "]") as the sequence ID, i.e. the
            # species name, and blank the description.
            seq.id = seq.description.split("[")[-1][:-1]
            seq.description = ""
        AlignIO.write(
            alignments, output_handle_nex,
            "nexus")  # write "alignments" with new ID to nexus format
        AlignIO.write(alignments, output_handle_fasta,
                      "fasta")  # to fasta format

# Turn each one-gene NEXUS file into a Nexus object, keyed by its file name
nexi = [(fname, Nexus.Nexus(fname)) for fname in file_list]

# Combine the one-gene NEXUS alignments of the different genes
combined = Nexus.combine(nexi)
Example #9
0
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This function is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print("%s %s %s" % (record.seq, record.name, record.id))

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    # Parser states
    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build a two-row alignment (query, match) from the current tags.

        Uses the enclosing scope's query/match sequences, ids, descriptions
        and tag dictionaries. On an AssertionError while extracting the
        aligned regions (including a length mismatch) the available context
        is printed and the error is re-raised.
        """
        assert query_tags, query_tags
        assert match_tags, match_tags
        q = "?"  #Just for printing len(q) in debug below
        m = "?"  #Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError:
            # Single-argument print(...) calls produce the same output under
            # Python 2's print statement and Python 3's print function.
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %s" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %s" % (m, len(m)))
            print(handle.name)
            raise

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        #.items() works on Python 2 and 3 (iteritems() is Python 2 only).
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Example #10
0
    from Bio.Align import AlignInfo
    from Bio.Align import MultipleSeqAlignment
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    # Work inside the directory given as argv[1]; every relative path used
    # below (including argv[2], if relative) is resolved against it.
    os.chdir(sys.argv[1])
    listing = os.listdir(".")
    consensus = {}
    # Placeholders, overwritten by the loop over the general alignment below.
    genConsensus = ''
    pssmGen = ''
    # Threshold passed to gap_consensus().
    consensusThres = 0.7

    #generalAlignment = AlignIO.parse(sys.argv[2],"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
    generalAlignment = AlignIO.parse(sys.argv[2],
                                     "fasta",
                                     alphabet=Gapped(
                                         IUPAC.ExtendedIUPACProtein(), "-"))
    lengthGenAl = 0
    # Each iteration overwrites the previous values, so after the loop these
    # variables describe the last alignment yielded from argv[2].
    for genAlignment in generalAlignment:
        sumGen = AlignInfo.SummaryInfo(genAlignment)
        genConsensus = sumGen.gap_consensus(consensusThres)
        #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-'])
        pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
        lengthGenAl = len(genAlignment)

    # Open every ".fas" file of the working directory as a gapped protein
    # FASTA alignment stream.
    for item in listing:
        if item.endswith(".fas"):
            #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
            alignments = AlignIO.parse(item,
                                       "fasta",
                                       alphabet=Gapped(
                                           IUPAC.ExtendedIUPACProtein(), "-"))
Example #11
0
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import IUPAC, Gapped
import time


# Prettify labels
def get_label(leaf):
    """Return the display label for a tree node.

    Nodes whose name starts with "Inner" get an empty label; any other
    name is returned with underscores replaced by spaces.
    """
    name = leaf.name
    return "" if name.startswith("Inner") else name.replace("_", " ")


# Read the sequences and align

# Start from an empty alignment and add every record of the FASTA file to it.
aln = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
for seq_record in SeqIO.parse("data/coding.fa", "fasta"):
    # for seq_record in SeqIO.parse("data/cons_noncode.fa", "fasta"):
    # Echo id, sequence repr and length of each record as it is added.
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    aln.extend([seq_record])

# Print the alignment
print(aln)

# Calculate the distance matrix
# 'identity' is the model name handed to DistanceCalculator.
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)

# Print the distance Matrix
def rename_alignment_taxa(aln, name_map):
    """Return a new alignment whose records carry the names from ``name_map``.

    Each record of ``aln`` has both its ``id`` and ``name`` set to
    ``name_map[record.id]`` (looked up with the old id) and is then appended
    to a fresh gapped unambiguous-DNA alignment. The records are modified in
    place, so ``aln`` sees the new names too. A missing id raises KeyError.
    """
    renamed = Alignment([], alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
    for record in aln:
        old_id = record.id
        # Both lookups use the old id, before either attribute is changed.
        new_id = name_map[old_id]
        new_name = name_map[old_id]
        record.id = new_id
        record.name = new_name
        renamed.append(record)
    return renamed
Example #13
0
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file.

    For every contig with enough reads, the clipped and padded reads are
    stacked against the contig sequence, positions marked "*" in the
    consensus are removed, and the best SNP haplotypes are selected with
    ``snp_positions`` / ``best_snps``. Accepted markers are written to three
    places: ``out_file`` (tab separated), ``out_bamova`` (per-group variant
    counts) and ``fasta_output/<contig name>.fasta``.

    Args:
        in_ace: path of the input ACE file.
        out_file: path of the tab-separated haplotype report.
        out_bamova: path of the bamova-style count file.
        win_len, step, stars: accepted for interface compatibility; not used
            by this function.
        coverage: contigs with ``len(contig.reads) - 1 < coverage`` are
            skipped; also passed to ``best_snps``.
        ngroups: minimum number of distinct groups (first three characters
            of the haplotype names) required to keep a marker.
        nhaplo: minimum number of haplotypes required in every group.

    The ``fasta_output`` directory must already exist.
    """
    marker_number = 0
    contig_counter = 0
    ntreated = 0
    cutoff = 100000  # stop once more than this many contigs were read
    # All three files are context-managed so they are closed on any exit
    # path (the ACE handle used to be left open).
    with open(in_ace, 'r') as ace_handle, \
            open(out_file, "w") as output_file, \
            open(out_bamova, "w") as bamova_file:
        output_file.write("Contig_nb\tWindow\tHaplotype\n")
        for contig in Ace.parse(ace_handle):
            contig_counter += 1
            # NOTE: skipped contigs bypass the flush and the cutoff check at
            # the end of the loop body, as in the original.
            if len(contig.reads) - 1 < coverage:
                continue
            ntreated += 1
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
            align.add_sequence(contig.name, contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Keep only the part of the read inside BOTH the quality
                # clipping and the alignment clipping: latest start,
                # earliest end. (The end used to be compared with itself,
                # "clipe2 < clipe2", so alignment clipping never applied.)
                clipst = max(read.qa.qual_clipping_start,
                             read.qa.align_clipping_start)
                clipe = min(read.qa.qual_clipping_end,
                            read.qa.align_clipping_end)
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
            # The first row is the contig consensus.
            concensus = sequences[0][1]
            # Remove "*" columns from every row, right to left so earlier
            # positions stay valid.
            for p in multi_find("*", concensus)[::-1]:
                sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                             for s in sequences]
            concensus = sequences[0][1]
            sequences = [[s[0], correct_sequence(concensus, s[1])]
                         for s in sequences[1:]]
            sequences, snp_pos = snp_positions(sequences)
            haplotypes = best_snps(sequences, snp_pos, coverage)
            if haplotypes != "Empty":
                best = haplotypes[-1]
                variants = sorted(set([h[-1] for h in best]))
                groups = sorted(set([h[0][:3] for h in best]))
                # Keep the marker only with enough groups, each holding at
                # least nhaplo haplotypes.
                pass_haplo = len(groups) >= ngroups and all(
                    len([h for h in best if h[0].startswith(g)]) >= nhaplo
                    for g in groups)
                if pass_haplo:
                    print(contig.name)
                    bamova_file.write("Marker" + str(marker_number) + "\n")
                    for group_number, g in enumerate(groups):
                        bamova_file.write("Population\t" + str(group_number))
                        for v in variants:
                            bamova_file.write("\t" + str(
                                len([
                                    h for h in best
                                    if h[-1] == v and h[0].startswith(g)
                                ])))
                        bamova_file.write("\n")
                    with open("fasta_output/" + contig.name + ".fasta",
                              "w") as f:
                        output_file.write(contig.name + "\n")
                        for h in best:
                            f.write(">" + h[0] + str(marker_number) +
                                    "\n" + h[2] + "\n")
                            # Positions relative to the first one, 1-based.
                            h[1] = [x - h[1][0] + 1 for x in h[1]]
                            output_file.write(
                                "Marker" + str(marker_number) + "\t" +
                                "\t".join([str(x) for x in h]) + "\t" +
                                ":".join(variants) + "\n")
                    marker_number += 1
            output_file.flush()
            bamova_file.flush()
            if contig_counter > cutoff:
                break
    # Same text as the former Python 2 print statement, valid on 2 and 3.
    print("\n%s contigs out of %s were treated" % (ntreated, contig_counter))
def main():
    """Codon-align DNA sequences through a MUSCLE protein alignment.

    Reads FASTA DNA sequences from the file named by ``argv[1]`` (or from
    standard input via ``fileinput`` when no argument is given), translates
    them, aligns the proteins with MUSCLE through a pipe, maps the protein
    alignment back onto the original codons and writes the DNA alignment in
    PHYLIP format next to the input file as ``<name>_aln.phy``
    (``out_aln.phy`` when reading from standard input).
    """
    import os  # local import: only needed to build the output path

    # Configuration
    #Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    dnaFileName = argv[1] if len(argv) > 1 else None
    dnaSeqFile = fileinput.input(dnaFileName)
    try:
        dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))
    finally:
        dnaSeqFile.close()

    # Translate the sequences, replacing stop codons with X (unknown aa)
    # so muscle doesn't drop them
    aaSeqRecords = []
    for key in dnaSeqDict:
        protein = str(dnaSeqDict[key].seq.translate(table=translationTable))
        aaSeqRecords.append(SeqRecord(Seq(protein.replace('*', 'X')), id=key))

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    childProcess = subprocess.Popen(commandLine, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=(sys.platform!="win32")) #don't pipe stderr or muscle hangs
    SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
    childProcess.stdin.close()
    aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    # Release the pipe and reap the child so no zombie process is left.
    childProcess.stdout.close()
    childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        sourceDna = str(dnaSeqDict[taxon.id].seq)
        aaCount = 0
        codons = []  # joined once at the end instead of repeated concatenation
        for aaResidue in taxon.seq:
            if aaResidue == '-':
                codons.append('---')
            else:
                codons.append(sourceDna[aaCount*3:aaCount*3+3])
                aaCount += 1
        # As we add the sequences to the alignment, remove the gene name from
        # the sequence id so the taxa match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], "".join(codons))

    if dnaFileName:
        # Cut at the first dot of the FILE name only; dots in the directory
        # part (e.g. "./x.fa", "../x.fa") used to truncate the whole path.
        directory, baseName = os.path.split(dnaFileName)
        outFileName = os.path.join(directory, baseName.split('.')[0] + '_aln.phy')
    else:
        outFileName = 'out_aln.phy'
    with open(outFileName, 'w') as outFile:
        AlignIO.write([dnaAlignment], outFile, "phylip")

    # NOTE: the interleaved " I" tag PAML requires is deliberately not
    # appended to the PHYLIP header here: files carrying it cannot be
    # re-opened with BioPython-based scripts. Use pamlize.py to add it
    # right before running PAML.
Example #15
0
# Join all SNPs of every sample into one string per sample.
final_snp_alignment = {}
if snp_mode == 'one':
	for key, value in final_dict.items():
		final_snp_alignment[key] = "".join(value)
elif snp_mode == 'all':
	# Each value is a list of lists; flatten it in a single pass instead of
	# the quadratic sum(values, []).
	for key, values in final_dict.items():
		final_snp_alignment[key] = "".join(snp for group in values for snp in group)

# Create the output file in output directory
output_file_fasta = os.path.join(out_dir,'snp.fasta')
# Write the SNP dictionary as a FASTA file.
# NOTE(review): "wb" combined with str payloads only works on Python 2;
# the modes are kept as they were.
with open(output_file_fasta, "wb") as f:
	for k, v in final_snp_alignment.items():
		f.write(">" + k+ "\n")
		f.write(v+ "\n")

# Create output file for SNAPP
output_file_nexus = os.path.join(out_dir,'snp.nexus')
# Read the FASTA back through a context manager so the handle is closed.
with open(output_file_fasta) as fasta_handle:
	aln = AlignIO.read(fasta_handle, "fasta", alphabet=Gapped(IUPAC.ambiguous_dna))
with open(output_file_nexus, "wb") as n:
	n.write(aln.format("nexus"))

# Rewrite the NEXUS "format" line in place: binary data for unphased input,
# integer data (0/1/2) for phased input.
if not args.phased:
	nexus_format = "format datatype=binary symbols=01 missing=?;"
else:
	nexus_format = "format datatype=integerdata symbols=\"012\" missing=?;"
for line in fileinput.input(output_file_nexus, inplace = 1):
	# With inplace=1, standard output is redirected into the file.
	print(line.replace("format datatype=dna missing=? gap=-;", nexus_format).rstrip())
Example #16
0
def getKmers(k, interval, outdir, msaFile, tp_prot_file, modelName, start, end,
             gene_pos_file, gene_pos_file_aa):
    """Slide a window of ``k`` columns over an alignment and build one HMM per window.

    The alignment in ``msaFile`` (FASTA) is optionally restricted to the
    columns selected by ``start``/``end``.  Leading and trailing columns that
    contain a gap are skipped, then windows of width ``k`` are taken every
    ``interval`` columns.  For each window whose first row holds at most 15
    gap characters, rows whose id appears in ``tp_prot_file`` are left out of
    the window alignment and their coordinates are logged instead; the
    remaining rows are written to a ``.fas`` file in ``outdir`` and passed to
    ``runHMMBuild``.

    Args:
        k: window width in alignment columns.
        interval: step between consecutive window starts.
        outdir: directory receiving the per-window FASTA files.
        msaFile: path to the FASTA multiple sequence alignment.
        tp_prot_file: FASTA file of sequences to exclude from the HMMs.
        modelName: label used in output file names and in the position tables.
        start, end: optional column range; applied only when both are given.
        gene_pos_file: tab-separated output table with coordinates times 3.
        gene_pos_file_aa: tab-separated output table with the coordinates
            found in the excluded sequence itself.

    Returns:
        dict mapping ``"<startPos>_<endPos>"`` (absolute alignment columns) to
        the ``HMMFile`` built for that window.
    """
    pprot_TP_dict = {}
    for record in SeqIO.parse(tp_prot_file, "fasta"):
        pprot_TP_dict[record.id] = str(record.seq)

    alignment = AlignIO.read(msaFile, "fasta")
    if start is not None and end is not None:
        # NOTE(review): the slice stops at column end - 1 (exclusive), so a
        # 1-based inclusive `end` would lose its last column - confirm intent.
        alignment = alignment[:, start - 1:end - 1]
    aln_length = alignment.get_alignment_length()
    print("Number of domains: %i" % len(alignment))
    print("Alignment length: %i" % aln_length)
    hmmDict = {}
    counter = int(((aln_length - k) / interval) + 1)

    # j: index of the first column without any gap (leading gappy columns skipped).
    j = 0
    for i in range(aln_length):
        if '-' in alignment[:, i]:
            j = j + 1
        else:
            break

    # seqCtr: one past the last column without any gap.
    seqCtr = aln_length
    for i in range(aln_length - 1, -1, -1):
        if '-' in alignment[:, i]:
            seqCtr = seqCtr - 1
        else:
            break

    table_header = "gene_name\tstart\tend\tinterval\tprot_type\n"
    # Context managers guarantee both tables are closed even if writing an
    # alignment or building an HMM raises part-way through.
    with open(gene_pos_file_aa, 'w') as gene_pos_out_aa, \
            open(gene_pos_file, 'w') as gene_pos_out:
        gene_pos_out_aa.write(table_header)
        gene_pos_out.write(table_header)
        for i in range(counter):
            startPos = j
            endPos = j + k
            if endPos > seqCtr:
                break
            # [ rows (different domains), columns (Amino Acids) ]
            kmer = alignment[:, startPos:endPos]

            if str(kmer[0].seq).count("-") <= 15:
                spHMMAlign = MultipleSeqAlignment(
                    [], Gapped(IUPAC.extended_protein, "-"))
                # Labels count windows from zero, independent of the leading
                # gap offset already folded into startPos/endPos.
                window_label = str(i * interval) + "_" + str(i * interval + k)
                # Remove the TP genes
                for align in kmer:
                    if align.id not in pprot_TP_dict:
                        spHMMAlign.append(align)
                        continue
                    prot_seq = str(align.seq).replace("-", "")
                    start_tp_coord = pprot_TP_dict[align.id].find(prot_seq)
                    end_tp_coord = start_tp_coord + len(prot_seq)
                    gene_pos_out_aa.write("\t".join([
                        align.id, str(start_tp_coord), str(end_tp_coord),
                        window_label, modelName]) + "\n")
                    gene_pos_out.write("\t".join([
                        align.id, str(start_tp_coord * 3),
                        str(end_tp_coord * 3), window_label,
                        modelName]) + "\n")

                outputFile = (outdir + os.sep + modelName + "__" + str(k) +
                              "_" + str(interval) + "__" + window_label +
                              ".fas")
                AlignIO.write(spHMMAlign, outputFile, "fasta")
                hmmFile = runHMMBuild(outputFile, modelName)
                hmmSegment = str(startPos) + "_" + str(endPos)
                hmmDict[hmmSegment] = HMMFile(i * interval, i * interval + k,
                                              hmmFile)

            j = j + interval
    return hmmDict
Example #17
0
import os

import Bio
from Bio.Alphabet import generic_dna
from Bio import motifs
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Alphabet import IUPAC, Gapped
from Bio.Align.Applications import ClustalwCommandline
from Bio.Align.Applications import ClustalwCommandline
from Bio.SubsMat import FreqTable

# Module-level alphabet: IUPAC ambiguous DNA wrapped in Gapped (no explicit
# gap character is passed here).
alph = Gapped(IUPAC.ambiguous_dna)


def printAlignmentInfo(alignment, alphabet):
    seqlist = []
    for record in alignment:
        seqlist.append(record.seq)

    m = motifs.create(seqlist, alphabet)
    pwm = m.counts.normalize()
    consensus = pwm.consensus

    summary_align = AlignInfo.SummaryInfo(alignment)

    consensus2 = summary_align.dumb_consensus()
    my_pssm = summary_align.pos_specific_score_matrix(consensus,
def convert_file(in_file, out_file):
    """Convert a FASTA protein alignment file to nexus format.

    Args:
        in_file: path of the FASTA alignment to read (a single alignment).
        out_file: path of the nexus file to write; overwritten if it exists.
    """
    # Both handles are managed by `with` so the input is released and the
    # output is flushed and closed before the function returns.
    with open(in_file) as in_handle:
        alignment = AlignIO.read(in_handle,
                                 "fasta",
                                 alphabet=Gapped(IUPAC.protein))
    with open(out_file, "w") as out_handle:
        out_handle.write(alignment.format("nexus"))
Example #19
0
    def next(self):
        """Read from the handle to construct and return the next alignment.

        Returns the pairwise alignment of the query and the match/library
        sequence as a MultipleSeqAlignment object containing two rows
        (named "query" and "match").  The query header tags and the
        alignment tags are stored on the private ``_annotations`` dict of
        the alignment; each record gets an "original_length" annotation and
        private ``_al_start``/``_al_stop`` attributes.

        The line that terminates the current alignment is kept in
        ``self._header`` so that the following call can resume from it.

        Raises StopIteration at end of file or when the ">>><<<" marker is
        reached, and ValueError when a header line, the aligned sequence
        lengths or the "sw_overlap" value are not as expected.  Several
        format expectations are additionally checked with assert.
        """
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            #No saved header, so read a fresh line from the handle.
            line = handle.readline()
        if not line:
            raise StopIteration

        if line.startswith("#"):
            #Skip the file header before the alignments.  e.g.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>"):
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
            #Now should be some alignments, but if not we move onto the next query
        if not line:
            #End of file
            raise StopIteration
        if ">>><<<" in line:
            #Reached the end of the alignments, no need to read the footer...
            raise StopIteration

        #Should start >>... and not >>>...
        assert line[0:2] == ">>" and not line[2] == ">", line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        match_descr = ""
        alignment_annotation = {}

        #This should be followed by the target match ID line, then more tags.
        #e.g.
        """
        >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
        ; fa_frame: f
        ; fa_initn:  52
        ; fa_init1:  52
        ; fa_opt:  70
        ; fa_z-score: 105.5
        ; fa_bits: 27.5
        ; fa_expect:  0.082
        ; sw_score: 70
        ; sw_ident: 0.279
        ; sw_sim: 0.651
        ; sw_overlap: 43
        """
        if (not line[0:2] == ">>") or line[0:3] == ">>>":
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #Handle the following "alignment hit" tagged data, e.g.
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "

        #Then we have the alignment numbers and sequence for the query
        """
        >gi|10955265| ..
        ; sq_len: 346
        ; sq_offset: 1
        ; sq_type: p
        ; al_start: 197
        ; al_stop: 238
        ; al_display_start: 167
        DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        GEYFTENKPKYIIREIHQET
        """
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        #The identifier on this line must be a prefix of the query description.
        assert self._query_descr.startswith(line[1:].split(None, 1)[0])

        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned query sequence (with leading flanking region)
        while not line[0] == ">":
            query_seq_parts.append(line.strip())
            line = handle.readline()

        #Handle the following "match alignment" data
        """
        >gi|152973545|ref|YP_001338596.1| ..
        ; sq_len: 242
        ; sq_type: p
        ; al_start: 52
        ; al_stop: 94
        ; al_display_start: 22
        IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        QDFAFTRKMRREARQVEQSW
        """
        #Match identifier
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError(
                "Expected line starting '>' and ending '..', got '%s'" %
                repr(line))
        #The identifier on this line must be a prefix of the match description.
        assert match_descr.startswith(line[1:].split(None, 1)[0])

        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned match sequence with flanking region...
        #but after that, since FASTA 35.4.1 there can be a consensus here,
        """
        ; al_cons:
        .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...: 
        etc
        """
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        if line[0:2] == "; ":
            assert line.strip() == "; al_cons:"
            align_consensus_parts = []
            line = handle.readline()
            while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
                align_consensus_parts.append(line.strip())
                line = handle.readline()
            #If we do anything with this in future, must remove any flanking region.
            align_consensus = "".join(align_consensus_parts)
            del align_consensus_parts
            assert not line[0:2] == "; "
        else:
            align_consensus = None
        assert (line[0] == ">" or ">>>" in line)
        #Keep the terminating line for the next call to this method.
        self._header = line

        #We built a list of strings and then joined them because
        #it's faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(
            query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(
            match_seq, match_annotation)
        #How can we do this for the (optional) consensus?

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the coordinates.

        if len(query_align_seq) != len(match_align_seq):
            raise ValueError(
                "Problem parsing the alignment sequence coordinates, "
                "following should be the same length but are not:\n"
                "%s - len %i\n%s - len %i" %
                (query_align_seq, len(query_align_seq), match_align_seq,
                 len(match_align_seq)))
        if "sw_overlap" in alignment_annotation:
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        #The alignment tags are copied second, so they win on a key clash.
        for key, value in self._query_header_annotation.iteritems():
            alignment._annotations[key] = value
        for key, value in alignment_annotation.iteritems():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(query_align_seq, alphabet),
            id=self._query_descr.split(None, 1)[0].strip(","),
            name="query",
            description=self._query_descr,
            annotations={"original_length": int(query_annotation["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_annotation["al_start"])
        record._al_stop = int(query_annotation["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        #Only refine the alphabet when the caller left it at the generic
        #single letter alphabet: "D" selects DNA and "p" selects protein.
        if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
            if query_annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in query_align_seq:
            #Wrap in a Gapped alphabet unless it already declares a gap_char.
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(match_align_seq, alphabet),
            id=match_descr.split(None, 1)[0].strip(","),
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_annotation["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_annotation["al_start"])
        record._al_stop = int(match_annotation["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
            if match_annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in match_align_seq:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Example #20
0
    True
    >>> _match_ambiguous_dna('A', 'T')
    False
    >>> _match_ambiguous_dna('A', 'A')
    True
    """
    x = x.upper()
    y = y.upper()
    xset = set(ambiguous_dna_values.get(x, x))
    yset = set(ambiguous_dna_values.get(y, y))
    if not xset.intersection(yset):
        return False
    return True


# Gapped ambiguous-DNA alphabet using '-' as the gap character; the name
# `alphabet` is bound to the same object.
DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, '-')
# Attach a `match` callable to the alphabet that delegates to
# _match_ambiguous_dna.
DNA_ALPHABET.match = lambda x, y: _match_ambiguous_dna(x, y)

# Flag vocabulary: LQ maps to the string 'LOWQUAL'.
FLAGS = MavisNamespace(LQ='LOWQUAL')

# Read-pair type labels.
# NOTE(review): presumably L/R encode the orientation of each read in a pair - confirm.
READ_PAIR_TYPE = MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR')

CALL_METHOD = MavisNamespace(CONTIG='contig',
                             SPLIT='split reads',
                             FLANK='flanking reads',
                             SPAN='spanning reads',
                             INPUT='input')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed call methods

- ``CONTIG``: a contig was assembled and aligned across the breakpoints
- ``SPLIT``: the event was called by :term:`split read`
 def compute_consensus(self):
     """Compute the gap consensus of ``self.msa`` and store it in ``self.consensus``.

     Every entry of ``self.msa`` is added to a gapped extended-protein
     alignment under its positional index as name; the consensus is taken
     with threshold 0 and "-" as the ambiguous character.
     """
     gapped_protein = Gapped(IUPAC.extended_protein, "-")
     alignment = MultipleSeqAlignment(gapped_protein)
     for row_index, row in enumerate(self.msa):
         alignment.add_sequence(str(row_index), str(row))
     self.consensus = AlignInfo.SummaryInfo(alignment).gap_consensus(
         threshold=0, ambiguous="-")
    print '\t\t\t<alignment idref="alignment"/>'
    print '\t\t\t<counts>'
    print '\t\t\t\t<parameter value="', constants['A'], constants[
        'C'], constants['G'], constants['T'], '"/>'
    print '\t\t\t</counts>'
    print '\t\t</constantPatterns>'
    print '\t</mergePatterns>'

    print '\nOr use replace_BEAST_blocks.py and provide the file', options.outfile + ".patterns", "with the -p flag"

    output = open(options.outfile + ".patterns", "w")
    print >> output, ' '.join(
        map(str,
            [constants['A'], constants['C'], constants['G'], constants['T']]))
    output.close()

    alignment = Generic.Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for name in snpsequence:

        #		if len(''.join(snpsequence[name]).replace("-","").replace("N",""))>float(len(snpsequence[name]))*(float(options.exclude)/100):
        #			alignment.add_sequence(name, ''.join(snpsequence[name]))
        #		else:
        #			print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped"
        if name in dates:
            alignment.add_sequence(name + "_" + str(dates[name]),
                                   ''.join(snpsequence[name]))
        else:
            alignment.add_sequence(name, ''.join(snpsequence[name]))

    AlignIO.write([alignment], open(options.outfile, 'w'), "fasta")
Example #23
0
def AceIterator(handle):
    """Yield one SeqRecord per contig found in an ACE file.

    Parsing is delegated to the Bio.Sequencing.Ace module.  Because the file
    is read in a single pass, any WA, CT, RT or WR footer tags are ignored.

    ACE files carry a base quality for each consensus position, taken to be
    PHRED style scores.  As with FASTQ or QUAL files read through Bio.SeqIO,
    they are stored in the SeqRecord's letter_annotations dictionary under
    the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    ACE files hold no base quality for gaps in the consensus sequence, so
    gaps are given a quality of zero.  Zero is perhaps misleading, as there
    may be very strong evidence supporting the gap.  Previous versions of
    Biopython used None instead, but this complicated usage and prevented
    output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for contig in Ace.parse(handle):
        raw_consensus = contig.sequence

        # Pick the alphabet: DNA unless a U is present; with a U it is RNA,
        # except when a T also occurs (very odd, maybe an error), in which
        # case fall back to the generic nucleotide alphabet.
        if "U" not in raw_consensus:
            alphabet = generic_dna
        elif "T" in raw_consensus:
            alphabet = generic_nucleotide
        else:
            alphabet = generic_rna

        if "*" not in raw_consensus:
            consensus = Seq(raw_consensus, alphabet)
        else:
            # ACE marks gaps with "*"; convert them to "-" for consistency
            # with most other file formats.
            assert "-" not in raw_consensus
            consensus = Seq(raw_consensus.replace("*", "-"),
                            Gapped(alphabet, gap_char="-"))

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        record = SeqRecord(consensus, id=contig.name, name=contig.name)

        # Consensus base quality (BQ lines).  Gaps (originally "*") have no
        # quality entry in the ACE data, so they are given a quality of zero
        # and do not consume a value from the quality list.
        phred = []
        used = 0
        for letter in consensus:
            if letter != "-":
                phred.append(contig.quality[used])
                used += 1
            else:
                phred.append(0)
        # Every quality value must have been matched to a non-gap letter.
        assert used == len(contig.quality)
        record.letter_annotations["phred_quality"] = phred

        yield record
 def stage_two_trimming(self, s1_trimmed, window_size, max_divergence,
                        min_len):
     """
     Alignment row-by-row trimming.  After stage one trimming, iterate
     over rows of alignment to find differences between the alignment
     consensus and the row (taxon) of data.  Trim those ends that differ
     from the consensus with >= `max_divergence` across a `window_size`
     window.  Trimmed positions are replaced by '-', so every row keeps
     its original length.

     Args:
         s1_trimmed: alignment produced by stage one trimming.
         window_size: number of positions per sliding window.
         max_divergence: a window must have a mismatch fraction strictly
             below this value to stop the trimming.
         min_len: minimum length a trimmed row must have.

     Returns:
         A new MultipleSeqAlignment holding the trimmed rows, or None as
         soon as one row consists only of '-' or only of '?' characters,
         or is shorter than `min_len`.
     """
     # create new alignment object to hold trimmed alignment
     s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna,
                                                  "-?"))
     # get consensus of alignment in array form
     consensus_array = numpy.array(
         list(self._alignment_consensus(s1_trimmed)))
     # iterate over each alignment sequence
     for sequence in s1_trimmed:
         # ensure sequence is uppercase - consensus will be, too
         sequence = sequence.upper()
         # get the true ends of the sequence by walking in until we hit some bases
         start, end = self._get_ends(sequence)
         # convert sequence to array
         orig_seq_array = numpy.array(list(sequence))
         # trim down gaps at edges so they do not exert undue influence
         # on trimming the sequence row
         seq_array = orig_seq_array[start:end]
         # defaults used when the comparison array is empty and the loops
         # below never run; this also ensures nothing carries over from
         # the previous iteration
         bad_start = 0
         bad_end = len(sequence)
         # =============================================================
         # get first 5' => 3' positions that start a `window_size` block
         # of sequence having a divergence of less than `max_divergence`
         # from the consensus sequence of all alignments
         # =============================================================
         # compare the sequence to the consensus, returns an array of
         # boolean values that are True where the row differs
         compare = (seq_array != consensus_array[start:end])
         # begin working from 5' => 3' across `compare` array
         for bad_start in xrange(compare.size):
             # get successive window-sized slices
             window = compare[bad_start:bad_start + window_size]
             divergence = float(sum(window)) / window.size
             # stop if we hit a point where divergence < max_divergence
             if divergence < max_divergence:
                 break
         # reverse the `compare` array and begin working 3' => 5'
         reversed_compare = compare[::-1]
         for bad_end in xrange(reversed_compare.size):
             window = reversed_compare[bad_end:bad_end + window_size]
             divergence = float(sum(window)) / window.size
             if divergence < max_divergence:
                 # convert the offset from the 3' end into a 5'-based index
                 bad_end = reversed_compare.size - bad_end
                 break
         # given original edge trimming and `bad_start`/`bad_end` values,
         # mask both ends of the sequence array with '-'
         orig_seq_array[:start + bad_start] = '-'
         orig_seq_array[start + bad_end:] = '-'
         trim = ''.join(orig_seq_array)
         # ensure the row consists of something other than only '-' or
         # only '?', and that it is >= min_len.  Both comparisons must be
         # set-to-set: comparing the set against the list ['?'] is always
         # unequal, which let all-'?' rows through.
         residues = set(trim)
         if (residues != set(['-']) and residues != set(['?'])
                 and len(trim) >= min_len):
             s2_trimmed.append(self._record_formatter(trim, sequence.id))
         # otherwise give up on the whole alignment and return None
         else:
             s2_trimmed = None
             break
     return s2_trimmed
Example #25
0
#!/usr/bin/env python

from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
import sys

#This script takes a FASTA alignment and converts it to a
#nexus alignment

# check for correct arguments
if len(sys.argv) != 3:
    print("Usage: FastaToNexus.py <inputfile> <outputfile>")
    # A usage error is a failure, so report it with a non-zero exit status.
    sys.exit(1)

input_name = sys.argv[1]
output_name = sys.argv[2]

# Parse the input completely before creating the output file, so a missing
# or malformed input does not leave an empty output file behind.
with open(input_name, 'r') as input_file:
    alignment = AlignIO.read(input_file,
                             'fasta',
                             alphabet=Gapped(IUPAC.ambiguous_dna, '-'))

# The context manager flushes and closes the output even if writing fails.
with open(output_name, 'w') as output_file:
    AlignIO.write(alignment, output_file, 'nexus')
Example #26
0
 def __init__(self):
     """Initialise the base Alignment with a gapped ('-') unambiguous DNA alphabet."""
     gapped_dna = Gapped(IUPAC.unambiguous_dna, '-')
     Alignment.__init__(self, gapped_dna)
Example #27
0
#! /usr/bin/env python
'''
'''
import sys
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped

# Convert the FASTA alignment(s) named by the first argument into a nexus
# file named by the second.  AlignIO.parse is lazy, so the input handle must
# stay open until AlignIO.write has consumed it; the context managers close
# both handles even if parsing or writing raises.
with open(sys.argv[1], "rU") as input_handle, \
        open(sys.argv[2], "w") as output_handle:
    alignments = AlignIO.parse(input_handle,
                               "fasta",
                               alphabet=Gapped(IUPAC.unambiguous_dna))
    AlignIO.write(alignments, output_handle, "nexus")
Example #28
0
 def get_aa(seq):
     """Translate a nucleotide string with translation table 1 and return the result as str.

     Translation does not stop at the first stop codon (``to_stop=False``).
     """
     dna = Seq(seq, Gapped(IUPAC.unambiguous_dna))
     protein = dna.translate(table=1, to_stop=False)
     return str(protein)
Example #29
0
 def _record_formatter(self, trim, name):
     """Wrap the string `trim` in a SeqRecord whose id, name and description are all `name`.

     The sequence uses the ambiguous DNA alphabet wrapped in Gapped with
     "-?" passed as the gap argument.
     """
     gapped_dna = Gapped(IUPAC.ambiguous_dna, "-?")
     trimmed_seq = Seq(trim, gapped_dna)
     return SeqRecord(trimmed_seq, id=name, name=name, description=name)
def _is_flagged_polymorphic_column(base_counts, iupac_bases):
    """Return True when an alignment column is a polymorphic site to be flagged.

    Args:
        base_counts: mapping (e.g. ``Counter``) from each distinct symbol found
            in the column to its count; only the set of distinct symbols is
            used.
        iupac_bases: collection of symbols treated as IUPAC ambiguity codes.

    Rules, by number of distinct symbols in the column:
        * 1 symbol: never flagged.
        * 2 symbols: flagged only when neither is a gap ("-") and neither is
          an IUPAC ambiguity code.
        * 3 symbols: flagged unless the column contains a gap together with
          exactly one IUPAC ambiguity code.
        * anything else: always flagged.
    """
    distinct = len(base_counts)
    if distinct == 1:
        return False

    # Look at every symbol: the gap may sit anywhere in the column's
    # frequency ranking, not only in the last (least common) position.
    has_gap = any(base == "-" for base in base_counts)
    iupac_count = sum(1 for base in base_counts if base in iupac_bases)

    if distinct == 2:
        return (not has_gap) and iupac_count == 0
    if distinct == 3:
        return not (has_gap and iupac_count == 1)
    return True


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask densely polymorphic alignment columns with gaps in outgroup records.

    For every file in ``<s7 dir>/s1_rm_polymorphism_sites/`` (where the s7
    directory is ``seq_directory`` with "s1_Gene/" replaced by
    "s7_well_trimal/"), the FASTA alignment is read and scanned in windows of
    ``window_size`` columns. Within a window, each column is classified by
    ``_is_flagged_polymorphic_column`` using the symbols of *all* records in
    the alignment. When a window holds more than ``Max_p_sites_o`` flagged
    columns, those columns are collected. Records whose id is in the outgroup
    collection get "-" written at every collected column; all other records
    are copied unchanged. The result is written as FASTA under
    ``<s7 dir>/s2_rm_polymorphism_in_outgroups/`` with the same file name.

    Args:
        seq_directory: path string containing "s1_Gene/"; used to derive the
            input and output directories.
        outgroup_path: passed to ``input_outgroup``; its return value is used
            for ``record.id in ...`` membership tests.
        window_size: number of columns per window handed to ``window``.
            NOTE(review): ``window`` is defined elsewhere; it is assumed to
            yield iterables of column indices -- confirm whether windows
            overlap.
        Max_p_sites_o: a window is masked only when it contains strictly more
            flagged columns than this value.

    Returns:
        None. Output is written to disk and progress is printed.

    Behaviour note: gap detection inspects every symbol in a column. Earlier
    code let the last entry of ``Counter.most_common()`` alone decide, so a
    gap that was not the least frequent symbol was missed.
    """
    ### define iupac
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H",
                   "D", "B"]

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")

    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file_name in os.listdir(output_directory_1):
        if file_name != ".DS_Store":
            output_directory_file = output_directory_2 + file_name
            fasta_name = output_directory_1 + file_name

            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " + sequence)

                alignment = AlignIO.read(sequence, 'fasta')

                ### change alignment to an array (rows = records, columns = sites).
                align_array_outgroup = np.array([list(rec) for rec in alignment])

                ### calculate the whole length of the alignment
                total_length = alignment.get_alignment_length()

                ### collect flagged columns from every window that is too polymorphic
                total_wrong_poly_sites_outgroup = []
                for each in window(range(total_length), window_size):
                    column_position_outgroup = [
                        column for column in each
                        if _is_flagged_polymorphic_column(
                            Counter(align_array_outgroup[:, column]), iupac_bases)
                    ]

                    ### keep the window's sites only when it exceeds the allowed maximum.
                    if len(column_position_outgroup) > float(Max_p_sites_o):
                        print(column_position_outgroup)
                        total_wrong_poly_sites_outgroup = total_wrong_poly_sites_outgroup + column_position_outgroup

                unique_wrong_sites_ougroup = list(np.unique(total_wrong_poly_sites_outgroup))
                print(unique_wrong_sites_ougroup)
                print("outgroup")

                # Set membership keeps the per-record masking linear in the
                # alignment length.
                masked_sites = set(total_wrong_poly_sites_outgroup)

                align_2 = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for record in alignment:
                    if record.id in outgroups:
                        print(record.seq)
                        new_seq = "".join(
                            "-" if position in masked_sites else base
                            for position, base in enumerate(str(record.seq))
                        )
                        align_2.add_sequence(str(record.id), str(new_seq))

                    else:
                        align_2.add_sequence(str(record.id), str(record.seq))

                print(align_2)

                AlignIO.write(align_2, output_directory_file, "fasta")