Example #1
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, Gapped


def translate(s):
    '''
    Assume we are in frame and translate DNA to amino acids.
    '''
    coding_dna = Seq(s[:(3 * int(len(s) / 3))], Gapped(IUPAC.ambiguous_dna))
    return str(coding_dna.translate())
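
A quick usage sketch (the DNA string is illustrative): the helper first trims the input to a whole number of codons, then translates.

print(translate("ATGGCCATTGTAATGG"))  # 16 bases -> 5 whole codons -> 'MAIVM'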
Example #2
import subprocess
import sys

from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Align.Applications import ClustalwCommandline
from Bio.Alphabet import IUPAC, Gapped
from Bio.SubsMat import FreqTable

# create the command line to run clustalw
# this assumes you've got clustalw somewhere on your path, otherwise
# you need to pass the full path of the executable to this via cmd="..."
cline = ClustalwCommandline(infile='opuntia.fasta', outfile='test.aln')

# actually perform the alignment
return_code = subprocess.call(str(cline), shell=(sys.platform != "win32"))
assert return_code == 0, "Calling ClustalW failed"

# Parse the output
alignment = AlignIO.read("test.aln",
                         "clustal",
                         alphabet=Gapped(IUPAC.unambiguous_dna))

print(alignment)

print("first description: %s" % alignment[0].description)
print("first sequence: %s" % alignment[0].seq)

# get the length of the alignment
print("length %i" % alignment.get_alignment_length())

print(alignment)

# print out interesting information about the alignment
summary_align = AlignInfo.SummaryInfo(alignment)

consensus = summary_align.dumb_consensus()
Example #3
 # Constructor of an Alignment subclass: initialise the parent class with a
 # gapped unambiguous-DNA alphabet using '-' as the gap character.
 def __init__(self):
     Alignment.__init__(self, Gapped(IUPAC.unambiguous_dna, '-'))
Example #4
from Bio.Alphabet import Gapped, SingleLetterAlphabet
from Bio.Seq import Seq

SPACES = ["-", ".", " ", "~"]
SPACE = SPACES[0]
MSF_SPACE = SPACES[1]
MSF_TERMINAL_SPACE = SPACES[3]

GAPPED_ALPHABET = Gapped(SingleLetterAlphabet(), SPACE)


class EmptySeq(Seq):
    def __init__(self):
        super(EmptySeq, self).__init__(SPACE, GAPPED_ALPHABET)

    def __len__(self):
        return 0

    def __asseq(self):
        return Seq(str(self), self.alphabet)

    def __add__(self, other):
        return self.__asseq().__add__(other)

    def __radd__(self, other):
        return self.__asseq().__radd__(other)


EMPTY_SEQ = EmptySeq()
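
A brief usage sketch (assuming the definitions above): EMPTY_SEQ reports a length of zero, while concatenation falls back to a plain gapped Seq that still carries the placeholder gap character.

fragment = Seq("ACGT", GAPPED_ALPHABET)
print(len(EMPTY_SEQ))             # 0
print(str(EMPTY_SEQ + fragment))  # '-ACGT'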
Example #5
#!/usr/bin/env python

from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
import sys

# This script takes a FASTA alignment and converts it to a
# PHYLIP sequential alignment.

# check for correct arguments
if len(sys.argv) != 3:
    print("Usage: FastaToPhylip.py <inputfile> <outputfile>")
    sys.exit(0)

input_name = sys.argv[1]
output_name = sys.argv[2]

input_file = open(input_name, 'r')
output_file = open(output_name, 'w')

alignment = AlignIO.read(input_file,
                         'fasta',
                         alphabet=Gapped(IUPAC.ambiguous_dna, '-'))
AlignIO.write(alignment, output_file, 'phylip-sequential')

input_file.close()
output_file.close()
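
For reference, the same conversion can be done in one call with AlignIO.convert; a minimal sketch (file names are illustrative):

from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped

AlignIO.convert("input.fasta", "fasta", "output.phy", "phylip-sequential",
                alphabet=Gapped(IUPAC.ambiguous_dna, '-'))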
Example #6
import subprocess
import sys

from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Alphabet import IUPAC, Gapped
from Bio.SubsMat import FreqTable

# create the command line to run clustalw
# this assumes you've got clustalw somewhere on your path, otherwise
# you need to pass the full path of the executable to this via cmd="..."
cline = ClustalwCommandline(infile="opuntia.fasta", outfile="test.aln")

# actually perform the alignment
return_code = subprocess.call(str(cline), shell=(sys.platform != "win32"))
assert return_code == 0, "Calling ClustalW failed"

# Parse the output
alignment = AlignIO.read("test.aln", "clustal", alphabet=Gapped(IUPAC.unambiguous_dna))

print(alignment)

print("first description: %s" % alignment[0].description)
print("first sequence: %s" % alignment[0].seq)

# get the length of the alignment
print("length %i" % alignment.get_alignment_length())

print(alignment)

# print out interesting information about the alignment
summary_align = AlignInfo.SummaryInfo(alignment)

consensus = summary_align.dumb_consensus()
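
The consensus can then be inspected directly; a minimal follow-up sketch (dumb_consensus() uses Biopython's default threshold of 0.7 and 'X' for ambiguous columns):

print(consensus)
print(len(consensus), alignment.get_alignment_length())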
Example #7
 def trim_alignment(self,
                    method='edges',
                    remove_probe=None,
                    bases=None,
                    consensus=True,
                    window_size=20,
                    threshold=0.5):
     """Trim the alignment"""
     if method == 'edges':
         # find edges of the alignment
         start = self._find_ends(forward=True)
         end = self._find_ends(forward=False)
     elif method == 'running':
         start, end = self.running_average(window_size, threshold)
     elif method == 'running-probe':
         # get position of probe
         for k, v in enumerate(self.alignment):
             if v.name == 'probe':
                 break
             else:
                 pass
         start, end = self.running_average(window_size, threshold, k, True)
     #pdb.set_trace()
     if method == 'notrim':
         self.trimmed_alignment = self.alignment
     else:
         # create a new alignment object to hold our alignment
         self.trimmed_alignment = Alignment(Gapped(IUPAC.ambiguous_dna,
                                                   "-"))
         for sequence in self.alignment:
             # ignore the probe sequence we added
             if (method == 'edges' or method == 'running'
                     or method == 'running-probe') and not remove_probe:
                 # Biopython's Alignment API only accepts a name and
                 # str(sequence) via add_sequence(), so as a workaround we
                 # append record slices to its private _records list directly.
                 if start >= 0 and end:
                     self.trimmed_alignment._records.append(
                         sequence[start:end])
                 else:
                     self.trimmed_alignment = None
                     break
             elif method == 'static' and not remove_probe and bases:
                 # get middle of alignment and trim out from that - there's a
                 # weakness here in that we are not actually locating the probe
                 # region, we're just locating the middle of the alignment
                 mid_point = len(sequence) / 2
                 if self._base_checker(bases, sequence, mid_point):
                     self.trimmed_alignment._records.append(
                         sequence[mid_point - bases:mid_point + bases])
                 else:
                     self.trimmed_alignment = None
             elif method == 'static' and not remove_probe and bases and self.ploc:
                 # get middle of alignment and trim out from that - there's a
                 # weakness here in that we are not actually locating the probe
                 # region, we're just locating the middle of the alignment
                 if self._base_checker(bases, sequence, self.ploc):
                     self.trimmed_alignment._records.append(
                         sequence[self.ploc[0] - bases:self.ploc[1] +
                                  bases])
                 else:
                     self.trimmed_alignment = None
             elif remove_probe and self.ploc:
                 # we have to drop to sequence level to add sequence slices
                 # where we basically slice around the probes location
                 temp = (sequence.seq[:self.ploc[0]] +
                         sequence.seq[self.ploc[1]:])
                 self.trimmed_alignment._records.append(
                     self._record_formatter(temp))
             elif method == 'static' and remove_probe and bases and self.ploc:
                 if self._base_checker(bases, sequence, self.ploc):
                     temp = sequence.seq[self.ploc[0]-bases:self.ploc[0]] + \
                         sequence.seq[self.ploc[1]:self.ploc[1]+bases]
                     self.trimmed_alignment._records.append(
                         self._record_formatter(temp))
                 else:
                     self.trimmed_alignment = None
     # build a dumb consensus
     if consensus and self.trimmed_alignment:
         self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
             self._alignment_summary(self.trimmed_alignment)
     if not self.trimmed_alignment:
         print "\tAlignment {0} dropped due to trimming".format(
             self.alignment._records[0].description.split('|')[1])
Example #8
import os
from glob import glob
from collections import Counter

import numpy as np
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import IUPAC, Gapped

# NOTE: window() (a sliding-window helper) and input_outgroup() are assumed
# to be defined elsewhere in the same module.


def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size=20, Max_p_sites=4):
    ### define IUPAC ambiguity codes
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H", "D", "B"]

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")

    ### mkdir output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if os.path.isdir(output_directory) == False:
        os.makedirs(output_directory)

    ### iterate each gene
    for file in os.listdir(genes_result_s6):
        if file != ".DS_Store":
            output_directory_file = output_directory + file
            fasta_name = genes_result_s6 + file

            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " +sequence)

                alignment = AlignIO.read(sequence, 'fasta')
                # print(alignment)

                ### generate a new alignment sequences without outgroups.
                align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))

                for record in alignment:
                    if record.id not in outgroups:
                        # print(record.id)
                        # print(record.seq)
                        align.add_sequence(str(record.id), str(record.seq))


                print(align)
                # print(align.get_alignment_length())


                total_wrong_poly_sites = []
                ### change alignment to an array.
                align_array = np.array([list(rec) for rec in align])
                ### , np.character
                # print(align_array)

                ### calculate the whole length of the alignment
                total_length = align.get_alignment_length()



                ### using 20bp-long sliding windows.
                for each in window(range(total_length), window_size):
                    # print(list(each))
                    poly_site_no_iupac = 0
                    poly_site_number = 0

                    column_position = []

                    ### for each block calculate the polymorphism sites number.
                    for column in each:
                        ### calculate each site (each column).
                        counter = Counter(align_array[:, column])

                        ### sorted by frequency
                        sorted_bases = counter.most_common()

                        # print(counter)
                        # print(sorted_bases)
                        # print(len(counter))

                        ### count the sites with different situations.
                        gap_yes = 0

                        if len(counter) ==1:
                            poly_site_number = poly_site_number + 0
                            poly_site_no_iupac = poly_site_no_iupac + 0


                        elif len(counter) == 2:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))

                            if gap_yes == 1:
                                # print counter
                                poly_site_number = poly_site_number + 0
                                poly_site_no_iupac = poly_site_no_iupac + 0

                            else:
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0

                                if len(iupac_in_alignment) == 0:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position.append(column)

                        elif len(counter) == 3:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))

                            if gap_yes == 1:
                                # print counter

                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    # poly_site_no_iupac = poly_site_no_iupac + 1
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0

                                else:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position.append(column)

                            else:
                                poly_site_number = poly_site_number + 1
                                poly_site_no_iupac = poly_site_no_iupac + 1
                                # print(column)
                                column_position.append(column)


                        else:
                            poly_site_number = poly_site_number + 1
                            poly_site_no_iupac = poly_site_no_iupac + 1
                            # print(column)
                            column_position.append(column)


                    # print("column_position: " + str(column_position))
                    # print(len(column_position))

                    ### if there are more than 4 polymorphic sites in 20 base pairs, select those sites positions.
                    if len(column_position) > float(Max_p_sites):
                        print(column_position)
                        total_wrong_poly_sites = total_wrong_poly_sites + column_position

                #print(total_wrong_poly_sites)

                ### generate the unique positions

                total_wrong_poly_sites = total_wrong_poly_sites + list(range(10))
                total_wrong_poly_sites = total_wrong_poly_sites + list(range(total_length-10, total_length))
                ### extract the polymorphic sites from alignment data, might be useful for delete the first 2 species.
                unique_wrong_sites = list(np.unique(total_wrong_poly_sites))
                print(len(unique_wrong_sites))
                # sum2 = alignment[:, total_length:total_length + 1]
                # for i in unique_wrong_sites:
                #     sum2 = sum2 + alignment[:, i:i+1]
                # print(sum2)
                # SeqIO.write(sum2, "/Users/zhouwenbin/Downloads/result/M40_total.phy", "phylip")


                ### operating: if any window has more than 3 polymorphic sites, use trimal to remove those sites.
                ### otherwise, copy the gene to the new folder.
                if len(unique_wrong_sites) > 0:

                    print(str(unique_wrong_sites).replace(" ", "").replace("[", "\{ ").replace("]", " \}"))

                    cmd_selected_col = str(unique_wrong_sites).replace(" ", "").replace("[", "\{ ").replace("]", " \}")

                    cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col

                    print(cmd)
                    os.system(cmd)

                else:
                    cmd_2 = "cp " + fasta_name + " " + output_directory_file
                    print(cmd_2)
                    os.system(cmd_2)
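
A hypothetical invocation (paths are illustrative): the function expects the s6_trimal output written by an earlier pipeline step plus a text file listing the outgroup taxa.

rm_wrong_polymorphism_sites("/data/myproject/s1_Gene/", "/data/myproject/outgroups.txt",
                            window_size=20, Max_p_sites=4)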
Example #9
from Bio.Alphabet import Gapped
from Bio.Alphabet.IUPAC import ambiguous_dna
from Bio.Data.IUPACData import ambiguous_dna_values

# MavisNamespace is assumed to be defined or imported elsewhere in this module.


def _match_ambiguous_dna(x, y):  # signature reconstructed from the body below
    """Check whether two bases match, treating IUPAC ambiguity codes as sets.

    >>> _match_ambiguous_dna('A', 'N')
    True
    >>> _match_ambiguous_dna('A', 'T')
    False
    >>> _match_ambiguous_dna('A', 'A')
    True
    """
    x = x.upper()
    y = y.upper()
    xset = set(ambiguous_dna_values.get(x, x))
    yset = set(ambiguous_dna_values.get(y, y))
    if not xset.intersection(yset):
        return False
    return True


DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, '-')
DNA_ALPHABET.match = lambda x, y: _match_ambiguous_dna(x, y)

FLAGS = MavisNamespace(LQ='LOWQUAL')

READ_PAIR_TYPE = MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR')

CALL_METHOD = MavisNamespace(CONTIG='contig', SPLIT='split reads', FLANK='flanking reads', SPAN='spanning reads', INPUT='input')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed call methods

- ``CONTIG``: a contig was assembled and aligned across the breakpoints
- ``SPLIT``: the event was called by :term:`split read`
- ``FLANK``: the event was called by :term:`flanking read pair`
- ``SPAN``: the event was called by :term:`spanning read`
"""
Example #10
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.
    
    """
    win_buffer = (win_len - 1) / 2
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start  # GOOD
                clipe = contig.reads[readn].qa.qual_clipping_end  # GOOD
                clipst2 = contig.reads[readn].qa.align_clipping_start  # Added
                clipe2 = contig.reads[readn].qa.align_clipping_end  # Added
                if clipst2 > clipst:  # Added
                    clipst = clipst2  # Added
                if clipe2 < clipe:  # Added (bug fix: original compared clipe2 with itself)
                    clipe = clipe2  # Added
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            positions = []
            try:
                positions = snp_dict[contig_name]
            except:
                continue
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, sequences[0][1])
                else:
                    pos_ok = pos
                left = pos_ok - 5
                if left < 0:
                    left = 0
                right = pos_ok + 1 + 5  # takes into account the middle nucleotide
                ref_window = sequences[0][1][left:right]
                d.setdefault(pos, {})
                d[pos].setdefault("XX_noTag", {})
                for nuc in list("ACGTN*-"):
                    d[pos]["XX_noTag"].setdefault(nuc, 0)
                for tag in tags:
                    d[pos].setdefault(tag, {})
                    for nuc in list("ACGTN*-"):
                        d[pos][tag].setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    del_count = 0
                    if window.count("-") > win_buffer - 3:
                        continue  # Need at least 3 nucleotides on each side
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                        else:
                            t = "XX_noTag"
                    if len(ref_window) == len(window):
                        for i in xrange(len(window)):
                            if ref_window[i].isalpha() and window[i] == "*" or \
                               window[i].isalpha() and ref_window[i] == "*":
                                del_count += 1
                    if del_count > max_del:
                        continue
                    p = pos
                    s = fasta[1]  # Sequence
                    n = s[pos_ok - 1].upper()
                    d[p][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" +
                                      str(t))
                    for n in list("ACGTN*-"):
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")
Example #11
from __future__ import division, print_function

from Bio.Alphabet import Gapped
from Bio.Alphabet.IUPAC import extended_dna, extended_protein

GAPS = '_.-='

AminoAlphabet = Gapped(extended_protein)
DNAAlphabet = Gapped(extended_dna)
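
A minimal usage sketch of the gapped alphabets defined above (Gapped() defaults to '-' as the gap character):

from Bio.Seq import Seq

peptide = Seq("MKV-LTAE", AminoAlphabet)
print(peptide.alphabet.gap_char)  # '-'
print(peptide.ungap('-'))         # MKVLTAE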
Example #12
# NOTE: this excerpt assumes the usual script-level imports (sys, os,
# Bio.AlignIO, and Gapped/IUPAC from Bio.Alphabet) plus the helper
# functions get_arguments() and usage() defined elsewhere in the script.


def remove_reco(brat_in, align):  # def line reconstructed from the call below
    """Return a copy of the alignment with recombinant columns removed."""
    inFile = open(brat_in, 'r')
    alignLength = align.get_alignment_length()
    genome = [0]*alignLength
    for index, line in enumerate(inFile):
        if index > 0:
            line = line.strip()
            wordList = line.split()
            start = int(wordList[0])
            stop = int(wordList[1])
            genome[start:stop + 1] = [x+1 for x in genome[start:stop + 1]]
    recoFreeAlign = align[:, 0:1]
    for i in range(1, len(genome)):
        if genome[i] == 0:
            recoFreeAlign = recoFreeAlign + align[:, i:i+1]
    return recoFreeAlign

# Get command line arguments
brat_in, fasta_in = get_arguments(sys.argv[1:])
if brat_in is None or fasta_in is None:
    usage()
    sys.exit(2)

# Read in BratNextGen File and FASTA alignment
align = AlignIO.read(fasta_in, "fasta", alphabet = Gapped(IUPAC.ambiguous_dna, '-'))
noRecoAlign = remove_reco(brat_in, align)


# output alignment without recombination
outName = os.path.splitext(fasta_in)[0] + "noReco.fasta"
AlignIO.write(noRecoAlign, outName, "fasta") 
Example #13
    def __init__(self, reference_path, patient=None):

        self.reference_df = pd.DataFrame()

        # first collect all the reference file names
        reference_list = [
            x for x in os.listdir(reference_path) if 'reference' in x
        ]

        # if a single patient was requested, keep only the matching file names
        if patient:
            reference_list = [x for x in reference_list if f'_{patient}.' in x]

        # gather the data from the JSON files
        for name in reference_list:
            with open(os.path.join(reference_path, name)) as f:
                json_file = json.load(f)

                # drop the columns we do not need
                t = pd.DataFrame(data=json_file).drop(['name', 'description'],
                                                      axis=1)

                # split the combined columns into separate ones
                t = pd.concat([
                    t.drop(['features'], axis=1),
                    t.features.apply(pd.Series)
                ],
                              axis=1)

                # convert 'location' to a plain list
                t.location = t.location.apply(pd.Series)

                # cut out the region's sequence
                t['region_seq'] = t.apply(
                    lambda x: x.seq[x.location[0]:x.location[1]].strip(),
                    axis=1)

                # rename for consistency
                t.rename(mapper={
                    'region_seq': 'sequence',
                    'seq': 'full_reference'
                },
                         axis=1,
                         inplace=True)
                t['translated'] = t.sequence.apply(lambda x: Seq(
                    x, Gapped(IUPAC.unambiguous_dna)).ungap().translate())
                self.reference_df = pd.concat([self.reference_df, t],
                                              ignore_index=True)

        # keep only the columns we need
        self.reference_df = self.reference_df[[
            'sequence', 'name', 'translated', 'id'
        ]]

        # fill in the rest by hand (this will be used in the tree)
        self.reference_df['days'] = 0
        self.reference_df['frequency'] = 100
        self.reference_df['nreads'] = 1

        # if this was run for a single patient, return an object with that patient's regions right away
        if patient:
            self.region = Region(self.reference_df)
Example #14
from Bio.Alphabet import Gapped, generic_dna, generic_nucleotide, generic_rna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Sequencing import Ace


def AceIterator(source):
    """Return SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(source):
        # Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        # Assume it is DNA unless there is a U in it.
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                # Very odd! Error?
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            # For consistency with most other file formats, map
            # any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines).  Note that any gaps (originally
        # * characters) in the consensus do not get a quality entry, so we
        # assign them a quality of zero here (earlier Biopython versions used
        # None instead; see the docstring above).
        quals = []
        i = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
            else:
                quals.append(ace_contig.quality[i])
                i += 1
        assert i == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
Example #15
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.
    
    """
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            window_len = 8  # PARAMETER
            max_diff = 3  # PARAMETER
            len_contig = len(sequences[0][1])
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                try:
                    start = len(re.findall("^-+", seq[1])[0])
                except:
                    start = 0
                len_seq = 0
                min_len_seq = 100  # PARAMETER
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = sequences[0][1][window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                len_seq -= seq[1].count("*")  # count '*' gaps in the sequence string, not the (name, seq) tuple
                if len_seq >= min_len_seq:
                    index = float(count) / len_seq
                    if count > 0:
                        number_indexes += 1
                        total_indexes += index
                else:
                    index = "NA"
                #output_file.write(contig_name + "\t" + str(index) + "\n")
            try:
                mean_index = float(total_indexes) / number_indexes
            except:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
Example #16
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Example #17
File: core.py  Project: PielLab/transator
    def run(self, consensusThreshold):
        from Bio import AlignIO, SeqIO
        from Bio.Align import AlignInfo
        # from Bio.Align import MultipleSeqAlignment
        from Bio.Alphabet import IUPAC, Gapped
        # from Bio.Seq import Seq
        # from Bio.SeqRecord import SeqRecord
        # Directory where files are
        # os.chdir(sys.argv[1])
        # listing = os.listdir(".")
        listing = os.listdir(self.pathToCladesAlignments)
        consensus = {}
        genConsensus = ''
        pssmGen = ''
        # this value should be read from the arguments or else use a default
        consensusThres = consensusThreshold
        # sys.argv[2] holds the path to the general alignment
        generalAlignment = AlignIO.parse(self.generalAlignment,
                                         "fasta",
                                         alphabet=Gapped(
                                             IUPAC.ExtendedIUPACProtein(),
                                             "-"))
        lengthGenAl = 0
        positionsToMask = []
        for genAlignment in generalAlignment:
            sumGen = AlignInfo.SummaryInfo(genAlignment)
            genConsensus = sumGen.gap_consensus(consensusThres)
            for index, residue in enumerate(genConsensus):
                if genConsensus[index] == '-':
                    continue
                if genConsensus[index] == 'X':
                    continue
                positionsToMask.append(index)
            #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-'])
            pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
            lengthGenAl = len(genAlignment)

        print(positionsToMask)
        print(listing)

        resultAlignFiles = []
        for item in listing:
            if item.endswith(".fas"):
                #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
                alignments = AlignIO.parse(self.pathToCladesAlignments + item,
                                           "fasta",
                                           alphabet=Gapped(
                                               IUPAC.ExtendedIUPACProtein(),
                                               "-"))
                for alignment in alignments:
                    summ = AlignInfo.SummaryInfo(alignment)
                    consensus[item] = summ.gap_consensus(consensusThres)
                    for posToMask in positionsToMask:
                        if consensus[item][posToMask] == '-':
                            continue
                        for alignElement in alignment:
                            mutSeq = alignElement.seq.tomutable()
                            mutSeq[posToMask] = 'X'
                            alignElement.seq = mutSeq.toseq()
                    SeqIO.write(
                        alignment, self.outPutPath + item +
                        "_noPKSsignal_Thres%d.faa" % (consensusThres * 100, ),
                        "fasta")
                    resultAlignFiles.append(self.outPutPath + item +
                                            "_noPKSsignal_Thres%d.faa" %
                                            (consensusThres * 100, ))
                    summ = AlignInfo.SummaryInfo(alignment)
                    consensus[item] = summ.gap_consensus(consensusThres)
                    print(item, consensus[item])
        return resultAlignFiles
Example #18
import os
from glob import glob
from collections import Counter

import numpy as np
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import IUPAC, Gapped

# NOTE: window() and input_outgroup() are assumed to be the same helper
# functions used by rm_wrong_polymorphism_sites() above.


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size=20, Max_p_sites_o=8):
    ### define IUPAC ambiguity codes
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H",
                   "D", "B"]

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")

    if os.path.isdir(output_directory_2) == False:
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file in os.listdir(output_directory_1):
        if file != ".DS_Store":
            output_directory_file = output_directory_2 + file
            fasta_name = output_directory_1 + file

            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " + sequence)

                alignment = AlignIO.read(sequence, 'fasta')

                ### calculate the polymorphism in outgroup
                ### change alignment to an array.
                total_wrong_poly_sites_outgroup = []

                align_array_outgroup = np.array([list(rec) for rec in alignment])
                ### , np.character
                # print(align_array)

                ### calculate the whole length of the alignment
                total_length = alignment.get_alignment_length()
                # alignment = AlignIO.read(sequence, 'fasta')
                for each in window(range(total_length), window_size):
                    # print(list(each))
                    poly_site_no_iupac = 0
                    poly_site_number = 0

                    column_position_outgroup = []

                    ### for each block calculate the polymorphism sites number.
                    for column in each:
                        ### calculate each site (each column).
                        counter = Counter(align_array_outgroup[:, column])

                        ### sorted by frequency
                        sorted_bases = counter.most_common()

                        # print(counter)
                        # print(sorted_bases)
                        # print(len(counter))

                        ### count the sites with different situations.
                        gap_yes = 0

                        if len(counter) ==1:
                            poly_site_number = poly_site_number + 0
                            poly_site_no_iupac = poly_site_no_iupac + 0


                        elif len(counter) == 2:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))

                            if gap_yes == 1:
                                # print counter
                                poly_site_number = poly_site_number + 0
                                poly_site_no_iupac = poly_site_no_iupac + 0

                            else:
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0

                                if len(iupac_in_alignment) == 0:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position_outgroup.append(column)

                        elif len(counter) == 3:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))

                            if gap_yes == 1:
                                # print counter

                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    # poly_site_no_iupac = poly_site_no_iupac + 1
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0

                                else:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position_outgroup.append(column)

                            else:
                                poly_site_number = poly_site_number + 1
                                poly_site_no_iupac = poly_site_no_iupac + 1
                                # print(column)
                                column_position_outgroup.append(column)


                        else:
                            poly_site_number = poly_site_number + 1
                            poly_site_no_iupac = poly_site_no_iupac + 1
                            # print(column)
                            column_position_outgroup.append(column)


                    # print("column_position: " + str(column_position))
                    # print(len(column_position))

                    ### if there are more than 8 polymorphic sites in 20 base pairs, select those sites positions.
                    if len(column_position_outgroup) > float(Max_p_sites_o):
                        print(column_position_outgroup)
                        total_wrong_poly_sites_outgroup = total_wrong_poly_sites_outgroup + column_position_outgroup


                unique_wrong_sites_ougroup = list(np.unique(total_wrong_poly_sites_outgroup))
                print(unique_wrong_sites_ougroup)
                print("outgroup")


                align_2 = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for record in alignment:
                    new_seq = ""

                    if record.id in outgroups:
                        print(record.seq)
                        for i in range(total_length):
                            if i in unique_wrong_sites_ougroup:
                                new_seq = new_seq + "-"
                            else:
                                new_seq = new_seq + str(record.seq[i])

                        align_2.add_sequence(str(record.id), str(new_seq))

                    else:
                        align_2.add_sequence(str(record.id), str(record.seq))

                print(align_2)

                AlignIO.write(align_2, output_directory_file, "fasta")
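
As with the previous step, a hypothetical invocation (paths are illustrative); this reads the s7_well_trimal/s1_rm_polymorphism_sites output produced by rm_wrong_polymorphism_sites().

replace_outgroup_with_gap("/data/myproject/s1_Gene/", "/data/myproject/outgroups.txt",
                          window_size=20, Max_p_sites_o=8)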
Example #19
# University of Florida

import argparse

from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped

parser = argparse.ArgumentParser()
parser.add_argument("-i", help="input Phylip formatted file")
parser.add_argument("-o", help="output filename")
parser.add_argument("-a", help="Alphabet: dna or aa, default=dna", default="dna")

args = parser.parse_args()

infile = args.i
outfile = args.o
alphabet = args.a

try:
	IN = open(infile, 'r')
except IOError:
	print("Can't open file", infile)

try:
	OUT = open(outfile, 'a')
except IOError:
	print("Can't open file", outfile)

if alphabet == "dna":		
	alignment = AlignIO.read(IN, "phylip-relaxed", alphabet=Gapped(IUPAC.ambiguous_dna))
	AlignIO.write([alignment], OUT, "nexus")

elif alphabet == "aa":		
	alignment = AlignIO.read(IN, "phylip-relaxed", alphabet=Gapped(IUPAC.protein))
	AlignIO.write([alignment], OUT, "nexus")
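
Typical command-line usage, assuming the script is saved as, say, phylip2nexus.py (the script and file names are illustrative): python phylip2nexus.py -i alignment.phy -o alignment.nex -a dna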
Example #20
#!/usr/bin/env python
"""Example of generating a substitution matrix from an alignment.
"""
# standard library
from __future__ import print_function

# Biopython
from Bio import SubsMat
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
from Bio.Align import AlignInfo

# get an alignment object from a Clustalw alignment output
c_align = AlignIO.read('protein.aln',
                       'clustal',
                       alphabet=Gapped(IUPAC.protein))
summary_align = AlignInfo.SummaryInfo(c_align)

# get a replacement dictionary and accepted replacement matrix
# exclude all amino acids that aren't charged polar
replace_info = summary_align.replacement_dictionary([
    "G", "A", "V", "L", "I", "M", "P", "F", "W", "S", "T", "N", "Q", "Y", "C"
])

my_arm = SubsMat.SeqMat(replace_info)

print(replace_info)

my_lom = SubsMat.make_log_odds_matrix(my_arm)

print('log_odds_mat: %s' % my_lom)
Example #21
from Bio import SeqIO
from Bio.Alphabet import IUPAC, Gapped
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def write_fasta(chromosome, RGID, refID):
    """Write an RGA FASTA alignment for each VCF."""
    outFile = RGID + "_RGA_pilon.fasta"
    Sample = "pilon_" + RGID
    record = SeqRecord(Seq("".join(chromosome), Gapped(IUPAC.ambiguous_dna, '-')),
                       id=Sample, description="RGA_to_" + refID)
    SeqIO.write(record, outFile, "fasta")
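
A hypothetical call (argument values are illustrative): 'chromosome' is any iterable of single-character bases and gaps that join into one consensus sequence.

write_fasta(list("ACGT-ACGTAC"), "sample01", "reference01")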