Beispiel #1
0
def main():
    """Concatenate every ``.fas`` alignment found in a user-supplied directory.

    Prints a usage banner, prompts for the directory holding the alignments,
    folds each matching file into a running alignment via ``concat`` (using
    the 'LG' model), writes the result with ``Alignment.write('concat')`` and
    finally converts ``concat_alignment.fasta`` with ``makephy``.
    """
    print('##############################################################')
    print(
        'This script must be in a directory with the folder called "Alignment" and the fastas you want to concatenate.'
    )
    print('The fastas must have the extension ".fasta" or it wont work.')
    print(
        'The output files are the concatenated seqs as a fasta and as a phylip for raxml-ing and a logfile.'
    )
    print(
        'The log file tells the length of the added sequence and the order in which they are added, so you can figure out your breakpoints. '
    )
    print('##############################################################')

    path2alignments = (input("path to the purged alignments: ")).strip() + "/"

    # The original opened this file for writing and never closed or used the
    # handle.  Keep the create/truncate side effect, but release the handle
    # straight away so it cannot leak or hold the file open during the write
    # below.
    with open('concat_alignment.fasta', 'w'):
        pass

    newAlignment = Alignment('')

    # NOTE(review): the banner above tells users to name files ".fasta", but
    # only names ending in ".fas" are picked up here -- confirm which is meant.
    for f in os.listdir(path2alignments):
        if f.endswith('.fas'):
            newAlignment = concat(newAlignment, path2alignments + f, 'LG')

    newAlignment.write('concat')
    # Presumably gives the written file time to settle before conversion --
    # TODO confirm whether this delay is still needed.
    time.sleep(20)
    makephy('concat_alignment.fasta')
Beispiel #2
0
def concat(newAlignment, f, model):
    """Append the alignment stored in file *f* to *newAlignment*.

    The file is loaded into an ``Alignment``, given *model*, and populated.
    ``fixFastas`` is then run on both alignments, and every sequence of
    *newAlignment* whose id also occurs in the loaded alignment is joined
    end-to-end with its partner.  A line with the file name, its character
    count and the model is sent to ``writelog``.

    Args:
        newAlignment: the running concatenated alignment (has ``seqs``).
        f: path of the alignment file to add.
        model: model name recorded on the loaded alignment and in the log.

    Returns:
        A new ``Alignment`` holding the joined sequences, with ``idlist``,
        ``numseq`` and ``numchar`` filled in.  ``numchar`` is 0 when no
        sequence ids matched.
    """
    print(f)

    a = Alignment(f)  # make an alignment instance from the file f
    a.setmodel(model)  # set the model for the alignment
    a.populate()

    fixFastas(newAlignment, a)
    tempAlignment = Alignment('')

    writelog(f + ':' + str(a.numchar) + ' ' + model)
    for seq in newAlignment.seqs:
        for aseq in a.seqs:
            if seq.id == aseq.id:
                newseq = SeqRecord(seq=seq.seq + aseq.seq, id=seq.id)
                tempAlignment.seqs.append(newseq)
                tempAlignment.idlist.append(newseq.id)

    tempAlignment.numseq = len(tempAlignment.seqs)
    # Guard the empty case: indexing seqs[0] used to raise IndexError when
    # nothing matched.
    if tempAlignment.seqs:
        tempAlignment.numchar = len(tempAlignment.seqs[0].seq)
    else:
        tempAlignment.numchar = 0

    return tempAlignment
Beispiel #3
0
 def open_alignment(self, event):  # wxGlade: MeerK40t.<event_handler>
     """Replace any open "alignment" window with a fresh one and show it.

     The previous window is closed through the project, a new ``Alignment``
     window is created and bound to the project, displayed, and then stored
     under ``project.windows["alignment"]``.  *event* is not used.
     """
     project.close_old_window("alignment")
     # Imported at call time, after the old window has been closed.
     from Alignment import Alignment
     alignment_window = Alignment(None, wx.ID_ANY, "")
     alignment_window.set_project(project)
     alignment_window.Show()
     project.windows["alignment"] = alignment_window
    def needleman_wunsch_matrix(self,seq1, seq2):
        """
        Build the Needleman-Wunsch score and traceback matrices.

        Both matrices have shape (len(seq1) + 1, len(seq2) + 1).  The score
        for pairing seq1[i-1] with seq2[j-1] is obtained by running a fresh
        ``Alignment().needleman_wunsch`` on the two elements and reading its
        ``score``.  Gaps cost -1 each.

        Returns the matrix of scores and the matrix of pointers.  Cells
        reached diagonally keep the pointer value 0 they were created with;
        the others hold ``self.UP`` or ``self.LEFT``.
        """
        indel = -1  # indel penalty

        n = len(seq1)
        m = len(seq2)

        s = np.zeros((n + 1, m + 1))  # DP matrix
        ptr = np.zeros((n + 1, m + 1), dtype=int)  # matrix of pointers

        # Base case: the first column/row is a run of 1..n (1..m) gaps.
        s[1:, 0] = indel * np.arange(1, n + 1)
        s[0, 1:] = indel * np.arange(1, m + 1)

        # Traceback borders: first row points LEFT, first column points UP.
        ptr[0, 1:] = self.LEFT
        ptr[1:, 0] = self.UP

        for i in range(1, n + 1):
            for j in range(1, m + 1):
                # Diagonal move: score of pairing the two current elements.
                cell_aligner = Alignment()
                cell_aligner.needleman_wunsch(seq1[i - 1], seq2[j - 1])
                best = s[i - 1, j - 1] + cell_aligner.score

                # Gap coming from the cell above.
                from_up = s[i - 1, j] + indel
                if from_up > best:
                    best = from_up
                    ptr[i, j] = self.UP

                # Gap coming from the cell to the left.
                from_left = s[i, j - 1] + indel
                if from_left > best:
                    best = from_left
                    ptr[i, j] = self.LEFT

                s[i, j] = best

        return s, ptr
Beispiel #5
0
    def needleman_wunsch_matrix(self, seq1, seq2):
        """
        Fill in the DP matrix according to the Needleman-Wunsch algorithm.

        The substitution score of each cell comes from a nested
        ``Alignment().needleman_wunsch`` run on the two paired elements;
        every gap costs -1.

        Returns the matrix of scores and the matrix of pointers, both of
        shape (len(seq1) + 1, len(seq2) + 1).  A pointer left at 0 marks a
        diagonal step; ``self.UP`` / ``self.LEFT`` mark gaps.
        """
        gap = -1  # indel penalty
        rows = len(seq1) + 1
        cols = len(seq2) + 1

        s = np.zeros((rows, cols))  # DP matrix
        ptr = np.zeros((rows, cols), dtype=int)  # matrix of pointers

        # Borders of the score matrix: k leading gaps cost k * gap.
        s[:, 0] = gap * np.arange(rows)
        s[0, :] = gap * np.arange(cols)

        # Borders of the traceback matrix describe those leading "-"s.
        ptr[0, 1:] = self.LEFT
        ptr[1:, 0] = self.UP

        for i in range(1, rows):
            for j in range(1, cols):
                # match
                pair = Alignment()
                pair.needleman_wunsch(seq1[i - 1], seq2[j - 1])
                s[i, j] = s[i - 1, j - 1] + pair.score

                # indel penalty (vertical move)
                vertical = s[i - 1, j] + gap
                if vertical > s[i, j]:
                    s[i, j] = vertical
                    ptr[i, j] = self.UP

                # indel penalty (horizontal move)
                horizontal = s[i, j - 1] + gap
                if horizontal > s[i, j]:
                    s[i, j] = horizontal
                    ptr[i, j] = self.LEFT

        return s, ptr
Beispiel #6
0
def gen_alignment_list(msa_xml_file):
    """Build ``Alignment`` objects from the first record of a BLAST XML file.

    Only the first record yielded by ``NCBIXML.parse`` is read.  For each of
    its alignments, one ``Alignment`` is created per HSP until an HSP whose
    subject range equals its query range is met; that HSP and the remaining
    HSPs of the same alignment are skipped.

    Args:
        msa_xml_file: path of the BLAST XML output.

    Returns:
        List of ``Alignment`` objects in file order.

    Raises:
        StopIteration: if the file contains no record.
    """
    alignment_obj_list = []

    # The parser reads lazily from the handle, so all consumption has to
    # happen while the file is still open; ``with`` guarantees it is closed
    # afterwards (the original never closed it).
    with open(msa_xml_file, "r") as result:
        records = NCBIXML.parse(result)
        item = next(records)
        for alignment in tqdm(item.alignments, desc="Parsing MSA XML"):
            for hsp in alignment.hsps:
                # presumably accessions look like "1ABC_A" (4-char id,
                # separator, chain letter) -- verify against the data
                hit_id = alignment.accession[0:4]
                chain_id = alignment.accession[5]
                query_range = (hsp.query_start, hsp.query_end)
                hit_range = (hsp.sbjct_start, hsp.sbjct_end)
                query_seq = hsp.query
                hit_seq = hsp.sbjct
                midline = hsp.match

                # dont want native
                if hit_range == query_range:
                    break

                alignment_obj = Alignment(hit_id, chain_id, query_range,
                                          hit_range, query_seq, hit_seq,
                                          midline)
                alignment_obj_list.append(alignment_obj)

    return alignment_obj_list
Beispiel #7
0
def command_setup(args):
    """Echo the chosen GraphMatch parameters and store them in module globals.

    Reads ``q``, ``i``, ``c``, ``m``, ``ip``, ``np``, ``k`` and ``s`` from
    *args*, prints a summary, sets the globals ``m``, ``indel_penalty``,
    ``non_assoc_vertex_penalty`` and ``score_method``, and appends ``args.k``
    placeholder alignments scored ``-inf`` to the global ``top_alignments``.
    """
    global m
    global indel_penalty
    global non_assoc_vertex_penalty
    global score_method
    global top_alignments

    print("Running GraphMatch...\n")
    print("Parameters chosen: ")
    print("Query Graph: %s\nReference Graph: %s\nCorrespondences File: %s" % (args.q, args.i, args.c))
    print("Number of indels allowed: %d\nIndel Penalty: %.2f\nNon-associated query vertex penalty: %.2f\nNumber of top results: %d" % (args.m, args.ip, args.np, args.k))
    if not args.s:
        print("Scoring Method: Raw match score")
        print("WARNING: due to scores being positive or negative for this approach, no branch and bound available.")
    else:
        print("Scoring Method: negative log(E-value)")
    print("")

    # Setting up the parameters
    m = args.m
    indel_penalty = args.ip
    non_assoc_vertex_penalty = args.np
    score_method = args.s

    # Seed the top alignments with minimum-score entries so that any real
    # alignment replaces them.
    for _ in range(args.k):
        top_alignments.append(Alignment(float("-inf"), [], [], [], []))
Beispiel #8
0
 def realign(self, outbase):
     """Calls the Alignment class, which calls Bowtie2 to realign reads.

     The input FASTQ and output SAM paths are derived from *outbase* by
     appending fixed suffixes; a progress line is logged before the
     ``Alignment`` object is constructed.
     """
     input_fq = outbase + "_input-homolog.fq"
     output_sam = outbase + "_input-homolog_realigned.sam"
     message = "Aligning %s to the genome (output at %s)..." % (input_fq,
                                                                output_sam)
     self.l.log(message)
     Alignment(self.args, input_fq, output_sam)
    def __init__(self):
        """Build the tools notebook with Index, Align and Viewer tabs.

        Each tab is a ``gtk.VBox`` holding a ``gtk.Alignment`` container
        that wraps the page widget.  Tabs are placed at the top and labelled
        from ``config.LOC``.
        """
        super(Tools, self).__init__()

        # (page widget class, key of the tab label in config.LOC)
        page_specs = (
            (Index, "tools_index"),
            (Alignment, "tools_align"),
            (Viewer, "tools_viewer"),
        )

        # Create box, centring container and page widget for every tab.
        pages = []
        for widget_class, label_key in page_specs:
            box = gtk.VBox(False, 5)
            holder = gtk.Alignment(0.5, 0.25, 0, 0)
            holder.add(widget_class())
            pages.append((box, holder, label_key))

        # Put each container into its box.
        for box, holder, _ in pages:
            box.pack_start(holder)

        # Register the boxes as notebook pages.
        self.set_tab_pos(gtk.POS_TOP)
        for box, _, label_key in pages:
            self.append_page(box)
            self.set_tab_label_text(box, config.LOC[label_key])
Beispiel #10
0
    def alignment(self):
        """Make self into an alignment, and return it.

        If all the sequences are the same length and type, then self,
        a sequenceList, could be an Alignment.  This method generates
        an Alignment instance, runs the Alignment method
        checkLengthsAndTypes(), and returns the Alignment.

        If you feed p4 a fasta sequence, it makes SequenceList object,
        and runs this method on it.  If it works then p4 puts the
        Alignment object in var.alignments, and if not it puts the
        SequenceList object in var.sequenceLists.

        It is possible that p4 might think that some short sequences
        are DNA when they are really protein.  In that case it will
        fail to make an alignment, because it will fail the types
        check.  So what you can do is something like this::

            sl = var.sequenceLists[0]
            for s in sl.sequences:
                s.dataType = 'protein'
            a = sl.alignment()

        The returned Alignment carries this object's ``fName`` and a deep
        copy of its sequences.
        """

        import copy
        from Alignment import Alignment

        a = Alignment()
        # fName was assigned twice in the original; once is enough.
        a.fName = self.fName
        a.sequences = copy.deepcopy(self.sequences)  # self will be deleted
        a.checkLengthsAndTypes()
        return a
Beispiel #11
0
def extract_bisents(file1, lang1, file2, lang2, alignment_file):
    """Return the sentence pairs of two texts according to an alignment file.

    Both language codes must be two-character strings.  Each text is loaded
    with ``Text.from_file``; the alignment read from *alignment_file* then
    pairs up their flattened sentence lists via ``as_pairs``.
    """
    for code in (lang1, lang2):
        assert isinstance(code, (unicode, str)) and len(code)==2

    first_text = Text.from_file(file1, lang1)
    second_text = Text.from_file(file2, lang2)
    return Alignment.from_file(alignment_file).as_pairs(
        first_text.as_sentences_flat(),
        second_text.as_sentences_flat())
Beispiel #12
0
    def get_alignment(self, langs, backend=None):
        """Return an Alignment whose columns follow the order of *langs*.

        Language codes are reduced to their first two characters.  With one
        distinct language a straight alignment is returned.  With two, the
        file ``<l1>-<l2>.<backend>`` is looked up through ``self._p`` for each
        candidate backend, first in one language order and then reversed;
        ``IOError`` is raised when nothing loads.  Otherwise the three fixed
        pairs pl/cu, cu/el and pl/el are fetched recursively and merged.

        Args:
            langs: at least two language codes.
            backend: a member of ``possible_backends``, or a false value to
                try all of them.
        """
        assert len(langs) >= 2
        assert not backend or backend in possible_backends

        real_langs = list(set(code[:2] for code in langs))
        distinct = len(real_langs)

        if distinct == 1:
            # NOTE(review): ``basename`` is not bound inside this method; it
            # has to come from an enclosing/module scope -- confirm.
            sentence_count = len(fetch_sentences(basename, real_langs[0]))
            return Alignment.create_straight(sentence_count, len(langs))

        if distinct == 2:
            candidates = [backend] if backend else possible_backends
            a = None
            for _attempt in range(2):
                pair_name = '-'.join(str(code) for code in real_langs)
                for suffix in candidates:
                    try:
                        a = Alignment.from_file(self._p(pair_name + '.' + suffix))
                    except IOError:
                        continue
                    break
                if a:
                    break
                # Nothing usable in this order: try the languages swapped.
                real_langs.reverse()
            if not a:
                raise IOError
        else:  # three languages
            a1 = self.get_alignment(['pl', 'cu'], backend).as_ladder()
            a2 = self.get_alignment(['cu', 'el'], backend).as_ladder()
            a3 = self.get_alignment(['pl', 'el'], backend).as_ladder()
            a = merge_3_alignments(a1, a2, a3)
            real_langs = ['pl', 'cu', 'el']  # needed later

        # common part for 2 and 3
        columns = _transpose(a.data)
        columns_map = {code: columns[i] for i, code in enumerate(real_langs)}

        chosen_columns = [columns_map[code[:2]] for code in langs]
        # NOTE(review): the third column is always appended as well; its
        # meaning is not visible here -- confirm.
        chosen_columns.append(columns[2])
        return Alignment(_transpose(chosen_columns))
Beispiel #13
0
    def calcUnconstrainedLogLikelihood1(self):
        """Calculate likelihood under the multinomial model.

        This calculates the unconstrained (multinomial) log like
        without regard to character partitions.  The result is placed
        in the data variable unconstrainedLogLikelihood.  If there is
        more than one partition, it makes a new temporary alignment
        and puts all the sequences in one part in that alignment.  So
        it ultimately only works on one data partition.  If there is
        more than one alignment, there is possibly more than one
        datatype, and so this method will refuse to do it.  Note that
        the unconstrained log like of the combined data is not the sum
        of the unconstrained log likes of the separate partitions.

        See also calcUnconstrainedLogLikelihood2

        """

        if len(self.alignments) > 1:
            gm = [
                "Data.calcUnconstrainedLogLikelihood()",
                "This method is not implemented for more than one alignment.",
            ]
            raise Glitch(gm)

        if self.nParts == 1:  # no problem
            self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(self.parts[0].cPart)
            return

        # Several parts: rebuild the single alignment as one temporary,
        # unpartitioned alignment and measure that instead.
        import copy
        a = self.alignments[0]
        newAlig = Alignment()
        newAlig.dataType = a.dataType
        newAlig.symbols = a.symbols
        newAlig.dim = a.dim
        newAlig.equates = a.equates
        newAlig.taxNames = a.taxNames
        for original_seq in a.sequences:
            seq_copy = copy.deepcopy(original_seq)
            newAlig.sequences.append(seq_copy)
        newAlig.checkLengthsAndTypes()
        newAlig._initParts()
        self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(newAlig.parts[0].cPart)
        del newAlig
Beispiel #14
0
def fetch_alignment(basename, langs, backend='hunalign'):
    """Load the alignment for *langs* from the files under *basename*.

    Language codes are cut to two characters.  One distinct language gives a
    straight alignment.  Two languages are read from
    ``<basename>/<l1>-<l2>.<backend>``, retrying with the languages swapped
    if the first file cannot be opened.  Otherwise the pl-cu, cu-el and
    pl-el files are read, converted to ladders (the last one with its pairs
    flipped) and merged.

    Returns:
        An ``Alignment`` with one column per entry of *langs*, followed by
        the third column of the loaded data.
    """
    assert langs
    real_langs = list(set(code[:2] for code in langs))
    distinct = len(real_langs)

    def pair_file(first, second):
        # Path of the alignment file for one ordered language pair.
        return "%s/%s-%s.%s" % (basename, first, second, backend)

    if distinct == 1:
        sentence_count = len(fetch_sentences(basename, real_langs[0]))
        return Alignment.create_straight(sentence_count, len(langs))

    if distinct == 2:
        try:
            a = Alignment.from_file(pair_file(real_langs[0], real_langs[1]))
        except IOError:
            # The file may be stored under the opposite language order.
            real_langs.reverse()
            a = Alignment.from_file(pair_file(real_langs[0], real_langs[1]))
    else:  # three languages
        a1 = Alignment.from_file('%s/pl-cu.%s' % (basename, backend)).as_ladder()
        a2 = Alignment.from_file('%s/cu-el.%s' % (basename, backend)).as_ladder()
        a3 = Alignment.from_file('%s/pl-el.%s' % (basename, backend)).as_ladder()
        a3 = [(right, left) for (left, right) in a3]  # reversed
        a = merge_3_alignments(a1, a2, a3)
        real_langs = ['pl', 'cu', 'el']  # needed later

    # common part for 2 and 3
    columns = _transpose(a.data)
    columns_map = {code: columns[i] for i, code in enumerate(real_langs)}

    chosen_columns = [columns_map[code[:2]] for code in langs]
    chosen_columns.append(columns[2])
    return Alignment(_transpose(chosen_columns))
Beispiel #15
0
 def extend_hit(self,hits):
     """Turn chains of seed hits into scored alignments.

     Each element of *hits* is a chain of seeds indexed as
     ``(query offset, database offset, length)``.  A chain is extended to
     the left with ``extend_head``, every seed is scored column by column
     with ``self.matrix``, the stretch between consecutive seeds is aligned
     with ``needle`` and the right end is extended with ``extend_tail``.
     Alignments whose ``E`` is below ``self.evalue_limit`` are kept.

     Returns:
         The kept alignments sorted by descending score, truncated to
         ``self.LIM`` entries.
     """
     def score_seed(seed):
         # Ungapped score of one seed plus its subject/query residues.
         subject = list(self.extract_seq(seed[1], seed[2]))
         query_part = list(self.query[seed[0]:seed[0]+seed[2]])
         total = 0
         for k in range(seed[2]):
             total += self.matrix[subject[k]+query_part[k]]
         return total, subject, query_part

     alignments = []
     db_length = self.seq_pos[-1]

     for hit in hits:
         head = hit[0]
         tail = hit[-1]
         name = self.seq_names[bisect_right(self.seq_pos,head[1])-1]
         species = self.species_names[bisect_right(self.species_idx,head[1])-1]
         score, align_s, align_q = self.extend_head(self.query[:head[0]], head[1])

         for i in range(len(hit)-1):
             cur = hit[i]
             nxt = hit[i+1]

             # The seed itself.
             seed_score, seed_s, seed_q = score_seed(cur)
             align_s += seed_s
             align_q += seed_q
             score += seed_score

             # The region between this seed and the next one.
             gap_s = self.extract_seq(cur[1]+cur[2], nxt[1]-cur[1]-cur[2])
             gap_q = self.query[cur[0]+cur[2]:nxt[0]]
             gap_score, gap_q, gap_s = self.needle(gap_s,gap_q)
             align_s += gap_s
             align_q += gap_q
             score += gap_score

         # Last seed of the chain, then the extension to its right.
         seed_score, seed_s, seed_q = score_seed(tail)
         align_s += seed_s
         align_q += seed_q
         score += seed_score

         end_score, end_s, end_q = self.extend_tail(self.query[tail[0]+tail[2]:], tail[1]+tail[2])
         align_s += end_s
         align_q += end_q
         score += end_score

         alignment = Alignment(species, name, score, align_q, align_s, self.matrix, len(self.query), db_length)
         if alignment.E < self.evalue_limit:
             alignments.append(alignment)

     alignments.sort(key=lambda aln: aln.score, reverse=True)
     return alignments[:self.LIM]
Beispiel #16
0
def ScoreAlignment(W_prime, G, G0):
    """Score the correspondence set *W_prime* of query graph *G0* against *G*.

    Every query vertex without a corresponding vertex costs
    ``non_assoc_vertex_penalty``.  A matched vertex contributes
    ``-log(score)`` (score clamped at 1e-200) when the global
    ``score_method`` is set, otherwise its raw correspondence score is
    subtracted.  For each query edge between two matched vertices, the
    interior vertices of the shortest path between their images in *G* are
    recorded as indels and charged ``indel_penalty`` each.

    Returns:
        ``Alignment(score, copy of W_prime, indel vertices, query edges,
        reference paths)``.
    """
    use_log_score = bool(score_method)
    score = 0
    assoc_vertices = []

    # Vertex contributions.
    for node in G0.node:
        corresp = getCorrespondingVertex(node, W_prime)
        if corresp is None:
            score = score - non_assoc_vertex_penalty
            continue
        assoc_vertices.append(node)
        if not use_log_score:
            # Raw correspondence score.
            score -= corresp.score
        elif corresp.score <= 1e-200:
            # Clamp so the logarithm stays finite.
            score += -1.0*log(1e-200)
        else:
            score += -1.0*log(corresp.score)

    # Indel penalties over every unordered pair of matched vertices.
    indel_vertices = []
    query_edges = []
    ref_edges = []
    for i, vi in enumerate(assoc_vertices):
        for vk in assoc_vertices[i + 1:]:
            if not G0.has_edge(vi, vk):
                continue
            query_edges.append((vi, vk))
            vij = getCorrespondingVertex(vi, W_prime)
            vkl = getCorrespondingVertex(vk, W_prime)
            path_G = nx.shortest_path(G, vij.name, vkl.name)
            pathLen = len(path_G) - 2  # vertices strictly between the ends
            indel_vertices.extend(path_G[h + 1] for h in range(pathLen))
            score = score - pathLen*indel_penalty
            ref_edges.append(path_G)

    return Alignment(score, copy.copy(W_prime), indel_vertices, query_edges, ref_edges)
Beispiel #17
0
    def run(self):
        """Detect landmarks and align every image of every image directory.

        For each name in ``self.img_dir_names`` the image files are listed,
        and each file is passed first to ``LandmarksDetector(...).run(True)``
        and then to ``Alignment(...).run()``.  After a directory is finished
        a ``done/total`` progress line and a blank line are printed.
        """
        number_of_dirs = len(self.img_dir_names)
        # Count from 1 so the line printed after a directory reports how
        # many are finished; previously it showed 0/N after the first
        # directory and never reached N/N.
        for count, dir_name in enumerate(self.img_dir_names, start=1):
            # NOTE(review): files are listed under self.img_path but
            # processed relative to the module-level IMG_PATH -- confirm the
            # two always refer to the same directory.
            img_file_names = File.get_img_file_names(
                os.path.join(self.img_path, dir_name))

            for file_name in img_file_names:
                LandmarksDetector(
                    land_path=LAND_PATH,
                    img_path=os.path.join(IMG_PATH, dir_name),
                    face_pose_predictor=self.face_pose_predictor,
                    face_detector=self.face_detector,
                    file_name=file_name,
                ).run(True)
                Alignment(land_path=LAND_PATH,
                          img_path=os.path.join(IMG_PATH, dir_name),
                          deformed_img_path=os.path.join(
                              DEFORMED_IMG_PATH, dir_name),
                          ideal_mask=self.ideal_mask,
                          file_name=file_name).run()
            print('{}/{}'.format(count, number_of_dirs))
            print()
Beispiel #18
0
 def __init__(self,readname,chr,start,end,strand,score,readcount,readsequence,cigar,qualstring):
     """Initialise the base Alignment and store the CIGAR and quality strings.

     NOTE(review): the ``score`` argument is not forwarded; the base class
     receives ``readcount`` as its score -- confirm this is intended.
     """
     Alignment.__init__(
         self, readname, chr, start, end, strand,
         score=readcount,
         readcount=readcount,
         readsequence=readsequence
     )
     self.cigar = cigar
     self.qual = qualstring
Beispiel #19
0
from Alignment import Alignment

# Show the alignment of the two demo sequences ...
print(Alignment("GCTGATATAGCT", "GGGTGATTAGCT"))
# ... then build it again and report its distance.
a = Alignment("GCTGATATAGCT", "GGGTGATTAGCT")
print(a.getDistance())
Beispiel #20
0
    def calcUnconstrainedLogLikelihood1(self):
        """Calculate likelihood under the multinomial model.

        This calculates the unconstrained (multinomial) log like
        without regard to character partitions.  The result is placed
        in the data variable unconstrainedLogLikelihood.  If there is
        more than one partition, it makes a new temporary alignment
        and puts all the sequences in one part in that alignment.  So
        it ultimately only works on one data partition.  If there is
        more than one alignment, there is possibly more than one
        datatype, and so this method will refuse to do it.  Note that
        the unconstrained log like of the combined data is not the sum
        of the unconstrained log likes of the separate partitions.

        See also calcUnconstrainedLogLikelihood2

        """

        if len(self.alignments) > 1:
            gm = ["Data.calcUnconstrainedLogLikelihood()"]
            gm.append(
                "This method is not implemented for more than one alignment.")
            raise Glitch(gm)

        if self.nParts != 1:
            import copy
            a = self.alignments[0]
            # Temporary single-part alignment sharing a's metadata and
            # holding deep copies of its sequences.
            newAlig = Alignment()
            for attr in ('dataType', 'symbols', 'dim', 'equates', 'taxNames'):
                setattr(newAlig, attr, getattr(a, attr))
            for s in a.sequences:
                newAlig.sequences.append(copy.deepcopy(s))
            newAlig.checkLengthsAndTypes()
            newAlig._initParts()
            c_part = newAlig.parts[0].cPart
            self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(
                c_part)
            del newAlig
        else:  # a single part can be used directly
            self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(
                self.parts[0].cPart)
Beispiel #21
0
    def getTemplates(self):
        """BLAST ``self.seq`` against PDB and record templates and alignments.

        Runs a remote ``blastp`` search (expect 0.01).  For every hit a
        ``Template`` is stored in ``self.templates`` and its FASTA in
        ``self.fastas``; for every HSP an ``Alignment`` is built, its target
        is padded with the unaligned ends of ``self.seq`` and its template
        with ``-`` characters, and it is appended to both the template and
        ``self.alignments``.  FASTA and alignment (JSON) files that do not
        exist yet are then written to ``self.fastasfolder`` and
        ``self.alignmentsfolder``.

        Returns:
            The keys of ``self.templates``.
        """

        if self.debug:
            print(self.seq)

        result_handle = NCBIWWW.qblast("blastp",
                                       "pdb",
                                       str(self.seq),
                                       expect=0.01)
        blast_records = NCBIXML.parse(result_handle)
        if self.debug:
            print("BLAST Request Finished")
            print()

        for record in blast_records:
            for alignment in record.alignments:
                template_id = alignment.accession
                fasta = self.getFastaFromId(template_id)
                title = alignment.title
                length = alignment.length

                template = Template(id=template_id,
                                    fasta=fasta,
                                    sequence=title,
                                    length=length,
                                    alignments=[])

                self.templates[template_id] = template
                self.fastas[template_id] = fasta
                for hsp in alignment.hsps:

                    a = Alignment(id=template_id,
                                  title=title,
                                  expect=hsp.expect,
                                  score=hsp.score,
                                  identities=hsp.identities,
                                  similarity=(100 * hsp.identities /
                                              len(self.seq)),
                                  target=hsp.query,
                                  targetstart=hsp.query_start,
                                  match=hsp.match,
                                  template=hsp.sbjct,
                                  templatestart=hsp.sbjct_start,
                                  length=length)

                    # Pad the aligned target with the parts of the full
                    # sequence lying outside the HSP.
                    # NOTE(review): a.length is built from the hit's
                    # ``alignment.length`` here, and is overwritten with the
                    # padded target length before ``templateend`` is
                    # computed -- confirm both are intended.
                    targetfront = str(self.seq[:a.targetstart - 1])
                    targetend = str(self.seq[(a.targetstart + a.length):])
                    a.target = targetfront + a.target + targetend
                    a.length = len(a.target)

                    # Pad the template with gap characters on both sides.
                    templatefront = '-' * (a.targetstart - 1)
                    templateend = '-' * (len(self.seq) -
                                         (a.targetstart + a.length))
                    a.template = templatefront + a.template + templateend

                    self.templates[template_id].alignments.append(a)
                    self.alignments.append(a)

        # Write each FASTA once; ``with`` closes the file even if the write
        # raises.
        for template_id, fasta in self.fastas.items():
            fname = '%s/%s.fasta' % (self.fastasfolder, template_id)
            if not os.path.exists(fname):
                with open(fname, 'w') as f:
                    SeqIO.write(fasta, f, 'fasta')

        # Same for the alignments, one JSON file per alignment index.
        for i, a in enumerate(self.alignments):
            fname = '%s/%s-%s.alignment' % (self.alignmentsfolder, a.id,
                                            str(i))
            if not os.path.exists(fname):
                with open(fname, 'w') as f:
                    json.dump(a.toJSON(), f)

        return self.templates.keys()
Beispiel #22
0
          for linear gap cost use -l or -linear\n\
          for affine gap cost use -a or -affine\n\
      - b, a: parameters for gap cost function\n\
          b -> constant gap cost or slope when performing linear/affine gap constant (extension penalty)\n\
          a -> instersect for affine gap cost (opening gap penalty)\n\
      - -o: output alignment. if missing then outputs optimal score")

# Parse the command line and collect the inputs of the alignment.
arguments = GetArguments(sys.argv)

sequences = [arguments.seq1, arguments.seq2]
substitution_matrix = arguments.score_matrix
gap_params = arguments.gap_params
alignmentType = arguments.alignment_type

# Align the two sequences.
my_alignment = Alignment(sequences, substitution_matrix, alignmentType, gap_params)
my_alignment.align()

# Without the output flag only the score is printed; with it, both aligned
# sequences are printed in FASTA form.
if not arguments.output:
    print(my_alignment.score)
else:
    print(">seq1")
    print(my_alignment.a)
    print(">seq2")
    print(my_alignment.b)

# # Score matrix print
# for row in range(len(T)):
#     print(T[row])
Beispiel #23
0
#!/usr/bin/env python3
# Usage: ./msa_exact.py score_matrix 5 test.fa
# Requires Alignment.py in the directory

from Alignment import Alignment
from Alignment import GetArguments
import sys

# Build the alignment object from the command-line arguments.
arguments = GetArguments(sys.argv)
test = Alignment(arguments.seqs, arguments.score_matrix, arguments.gapcost)

# Compute the exact score, then backtrack to recover the alignment.
score = test.sp_exact_3()
alignm = test.backtrack_msa_exact()

# Print every aligned row under its FASTA header.
for i, row in enumerate(alignm):
    print(">", arguments.heads[i])
    print(test.num_to_sequence(row), "\n")
Beispiel #24
0
def main(argv):
    """Command-line entry point for AnnotateTreeCmd.

    argv is the full argument vector, in one of two shapes:
      [prog, '-t']                                              run the self-tests
      [prog, seqnumfile, seqfile, treefile, cdrfile, tag, wd]   annotate a tree

    An empty cdrfile argument switches the CDR analysis off.

    Problems are reported on stdout and end the process through sys.exit(0);
    the zero exit status is kept as it was so existing callers see no change.
    """
    print("AnnotateTreeCmd v1.0")
    if len(argv) == 2 and argv[1] == '-t':
        conduct_tests()
        sys.exit(0)
    elif len(argv) != 7:
        print('usage python AnnotateTreeCmd.py seqnumfile seqfile treefile cdrfile tag wd.')
        sys.exit(0)

    # The first three file arguments are mandatory.
    for required_file in argv[1:4]:
        check_file(required_file)

    (seqnumfile, seqfile, treefile, cdrfile, tag, wdir) = argv[1:7]

    if len(cdrfile) > 0:
        check_file(cdrfile)
    else:
        cdrfile = None

    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except Exception:
        print("Error creating directory %s." % wdir)
        sys.exit(0)

    # 'except Exception' (not a bare except) is deliberate below: sys.exit()
    # raises SystemExit, which a bare except would catch and then report as a
    # bogus "Error parsing" message after the stop-codon message.
    try:
        msa = Alignment()
        msa.read_nt(seqfile)    # Check that the sequence comprises a valid set of codons
        for seq in msa:
            if '*' in seq:
                print("Stop codon found in sequence %s." % seq.id)
                sys.exit(0)
    except Exception:
        print("Error parsing %s: %s." % (seqfile, sys.exc_info()[1]))
        sys.exit(0)

    try:
        seq_pos = msa.read_position_numbers(seqnumfile)
    except Exception:
        print("Error parsing %s: %s." % (seqnumfile, sys.exc_info()[1]))
        sys.exit(0)

    if cdrfile is not None:
        # Constructed only to validate the CDR file early; the result is
        # rebuilt from the dnaml output further down.
        try:
            acdr = AnalyseCDR(msa, file_name=cdrfile)
        except Exception:
            print("Error parsing %s: %s." % (cdrfile, sys.exc_info()[1]))
            sys.exit(0)

    # Accept the sequence file as FASTA first, then fall back to PHYLIP.
    try:
        seq_align = AlignIO.read(seqfile, "fasta")
    except Exception:
        try:
            seq_align = AlignIO.read(seqfile, "phylip")
        except Exception:
            print("Error parsing %s: %s." % (seqfile, sys.exc_info()[1]))
            sys.exit(0)

    try:
        tree = Phylo.read(treefile, "newick")
    except Exception:
        print("Error parsing %s: %s." % (treefile, sys.exc_info()[1]))
        sys.exit(0)

    dnaml = Dnaml()

    int_aas = dnaml.run_dnaml(seq_align, tree, seq_pos, cdrfile, wdir, report, tag)

    if int_aas is not None:
        # Every output file lives in wdir and carries the tag as a name prefix.
        prefix = wdir + "/" + tag

        try:
            if cdrfile is not None:
                acdr = AnalyseCDR(int_aas, file_name=cdrfile)
                cdr_output = acdr.analyse()
                with open(prefix + "cdr_analysis.html", "w") as fo:
                    fo.write(cdr_output)
        except Exception:
            print("Warning: CDRs were not analysed: " + str(sys.exc_info()[1]))

        try:
            # Render each tree as PNG then SVG, collecting garbage around
            # every render exactly as the unrolled original did.
            stems = ["annotated_treefile"]
            if cdrfile is not None:
                stems.append("annotated_treefile_sum")
            stems.extend(["annotated_treefile_tot", "intermediates_treefile"])

            gc.collect()
            for stem in stems:
                for ext in ("png", "svg"):
                    RenderTree.render_annotate(prefix + stem + ".new", prefix + stem + "." + ext)
                    gc.collect()
        except Exception:
            print("Error rendering trees: " + str(sys.exc_info()[1]))

        # The logo is built from the supplied sequences only: drop the first
        # record and every inferred intermediate ("node_") record.
        orig_recs = [
            rec
            for index, rec in enumerate(SeqIO.parse(prefix + "aa_alignment.fa", "fasta"))
            if index > 0 and "node_" not in rec.id
        ]

        logo_alignment_file = prefix + "alignment_for_logo.fa"
        SeqIO.write(orig_recs, logo_alignment_file, "fasta")

        with open(prefix + "weblogo_status.txt", "w") as fo:
            # NOTE(review): with shell=True a POSIX shell reports a missing
            # command as 127, not 1, so these checks may not fire when seqlogo
            # is absent - confirm the intended exit codes.
            retcode = subprocess.call("seqlogo -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag, cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
            if retcode == 1:
                fo.write("Trying seqlogo.pl instead.\n")
                retcode = subprocess.call("seqlogo.pl -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag, cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
            if retcode == 1:
                print("Weblogo not installed: logo plot will not be generated.")
 def newFunction(*other, **args):
     """Call f with the first positional argument coerced to an Alignment.

     f comes from the enclosing scope, which is not visible here. If the
     first positional argument is already an Alignment the call is passed
     straight through; otherwise it is wrapped with Alignment(...) first.
     """
     if isinstance(other[0], Alignment):
         return f(*other, **args)
     # Swap only the leading argument; the rest are forwarded untouched.
     coerced = (Alignment(other[0]),) + other[1:]
     return f(*coerced, **args)
Beispiel #26
0
    def run_dnaml(self, seq_align, ptree, seqpattern, cdrfile, wdir, rep, tag=""):
        """Run dnaml and write annotated trees and alignments into wdir. Arguments are:
           seq_align: the input nt sequences (MultipleSequenceAlignment). The record ids are overwritten
                with safe serial names ("N%09d") as a side effect.
           ptree: phylogenetic tree (Bio.Phylo). It is re-rooted and its nodes are named in place.
           seqpattern: A list of sequence number directives, in the format accepted by Alignment.set_position_numbers
           cdrfile: file name handed to AnalyseCDR, or None to skip the CDR/FR summary tree
           wdir: the name of a directory that run_dnaml should use. This must exist already.
           rep: a function that takes a string as an argument. This will be called should an error or warning be
                necessary (may be called multiple times in one invocation).
           tag: an optional tag to prefix filenames with

           Sequences in seq_align must be the same length, must start on a codon boundary, and be an integral number
           of codons in length. The first sequence must be the ancestral sequence or outgroup. Exactly the same sequence
           names must occur in the alignment and the tree. Sequence name format is pretty flexible (sequences are
           mapped to safe serial names before dnaml runs and remapped afterwards).

           Returns the Alignment built from the dnaml output via read_nt (restricted to the names present in the
           tree), or None when dnaml produced no usable output (rep is called with the reason).
           Raises ValueError when the root sequence is missing from the tree or the trees cannot be rooted.
        """

        def in_wdir(name):
            # All working files live directly inside wdir.
            return wdir + "/" + name

        # Translate clade names to something safe
        namedict = {}
        for serial, seq in enumerate(seq_align, 1):
            namedict[seq.id] = "N%09d" % serial
            seq.id = namedict[seq.id]

        # Safe name -> original name, used to restore names afterwards.
        inv_dict = {v: k for k, v in namedict.items()}

        qtree = copy.deepcopy(ptree)

        for clade in qtree.get_terminals():
            if clade.name and clade.name in namedict:
                clade.name = namedict[clade.name]

        # Root the tree on the first record

        first = "N%09d" % 1

        try:
            # Builtin next() instead of the Python-2-only generator.next().
            qtree.root_with_outgroup(next(qtree.find_clades(name=re.escape(first))))
        except Exception:
            raise ValueError("Error: root sequence not found in tree.")

        try:
            ptree.root_with_outgroup(ptree.find_clades(name=re.escape(inv_dict[first])))
            Phylo.write(ptree, in_wdir("input_treefile.new"), "newick", plain=False)
        except Exception:
            raise ValueError("Error rooting trees: check for corrupt tree file or duplicated sequences.")

        # Write the sequences, in PHYLIP format (real PHYLIP format, as used by PHYLIP!)

        with open(in_wdir("infile"), "w") as f:
            f.write("  %d  %d\n" % (len(seq_align), len(seq_align[0])))
            for seq in seq_align:
                f.write("%10s%s\n" % (seq.id, seq.seq.upper()))

        # Write the tree file

        Phylo.write(qtree, in_wdir("intree"), "newick")

        # Remove stale results so that their presence afterwards proves dnaml ran.
        for stale in ("outfile", "outtree"):
            if os.path.exists(in_wdir(stale)):
                os.remove(in_wdir(stale))

        # The path to the module may reference either a .py or a .pyc file.
        # Only the extension is swapped: a plain str.replace on the whole path
        # would also rewrite any directory name that happens to contain ".py".

        module_path = os.path.abspath(__file__)
        ctlfile = os.path.splitext(module_path)[0] + ".ctl"

        # Check for dnaml next to this module

        dnamlfile = os.path.join(os.path.dirname(module_path), "dnaml")

        if not os.path.exists(dnamlfile):
            dnamlfile = "dnaml"  # must be on the path somewhere

        # The control file is fed to dnaml on stdin; its console output is logged.
        with open(in_wdir("dnaml.txt"), "w") as log, open(ctlfile, "r") as ctl:
            subprocess.call(dnamlfile, cwd=wdir, stdin=ctl, stdout=log)

        if not os.path.isfile(in_wdir("outfile")):
            rep("No output returned by dnaml: please check the logs for the issue.")
            return None

        if os.path.isfile(in_wdir("outfile.txt")):
            os.remove(in_wdir("outfile.txt"))
        os.rename(in_wdir("outfile"), in_wdir("outfile.txt"))

        intseqs = self.__parse_outfile(in_wdir("outfile.txt"))

        if not intseqs:
            rep("Unexpected output returned by dnaml: please check the logs for the issue.")
            return None

        # Custom sort function to put the root record first, then others supplied by the user, then intermediate nodes
        def key_ids(rec):
            if rec.id == "N%09d" % 1:
                return 'a__' + rec.id
            elif 'node_' in rec.id:
                return 'z__' + "%04d" % int(rec.id.split("_")[1])
            else:
                return 'l__' + rec.id

        labelled_tree = Phylo.read(in_wdir("outtree"), "newick")

        # intseqs is read twice, so it is rewound before each read.
        intseqs.seek(0)
        int_seqs = Alignment(file_name=intseqs, format="fasta")
        int_seqs.sort(key=key_ids)
        intseqs.seek(0)
        int_aas = Alignment()
        int_aas.read_nt(intseqs, "fasta")
        int_aas.sort(key=key_ids)
        int_aas.set_position_numbers(position_numbers=seqpattern)

        # Put back the original names in all our collections

        for collection in (int_seqs, int_aas):
            for seq in collection:
                if seq.id in inv_dict:
                    seq.id = inv_dict[seq.id]
                seq.name = ""
                seq.description = ""

        nodeid = 1
        for clade in labelled_tree.find_clades(order="preorder"):
            if clade.name is None:
                clade.name = "node_%d" % nodeid  # This relies on our traversal using the same order as dnaml
                nodeid += 1
            elif clade.name in inv_dict:
                clade.name = inv_dict[clade.name]

        # Now we need to map the labelling of the nodes in the labelled tree to the nodes in the original tree

        self.__map_names(ptree, labelled_tree)
        Phylo.write(ptree, in_wdir(tag + "intermediates_treefile.new"), "newick", plain=False)

        # Keep only the records whose names occur in the tree (a set gives O(1) membership tests).
        cladenames = {clade.name for clade in ptree.find_clades() if clade.name is not None}
        new_int_aas = Alignment()

        for rec in int_aas:
            if rec.id in cladenames:
                new_int_aas.append(rec)

        int_aas = new_int_aas
        int_aas.set_position_numbers(position_numbers=seqpattern)

        # Pristine copy: each annotated tree below starts from the same named tree.
        copy_tree = copy.deepcopy(ptree)

        def diffkey(diff):
            return int_aas.index_of(diff[1:-1])

        def parent_of(tree, clade):
            # Parent of the clade, falling back to the root sequence's clade
            # for clades that hang directly off the tree root.
            # NOTE(review): `first` is the safe serial name while the tree
            # appears to carry original names here, so `clade.name != first`
            # may always hold - confirm whether inv_dict[first] was intended.
            parent = self.__get_parent(tree, clade)
            if parent is None:
                path = tree.get_path(clade)
                if len(path) == 1 and clade.name != first:
                    parent = next(tree.find_clades(name=re.escape(inv_dict[first])))
            return parent

        def relabel(tree, make_label):
            # Compute every label first, then apply, so that lookups during
            # the first pass still see the unmodified clade names.
            labels = {}
            for clade in tree.find_clades():
                if clade.name is not None:
                    parent = parent_of(tree, clade)
                    if parent is not None and parent.name is not None:
                        labels[clade.name] = make_label(clade, parent)

            for clade in tree.find_clades():
                if clade.name is not None and clade.name in labels:
                    clade.name = labels[clade.name]

        def with_name(clade, text):
            # Intermediate nodes show the annotation only; others keep their name in front.
            if "node_" in clade.name:
                return text
            return str(clade.name) + " " + text

        # Calculate AA diffs between each node and its parent, and write to the tree

        def diff_label(clade, parent):
            diffs = list(int_aas.seqdiff(clade.name, parent.name))
            diffs.sort(key=diffkey)
            return with_name(clade, "+".join(diffs))

        relabel(ptree, diff_label)
        Phylo.write(ptree, in_wdir(tag + "annotated_treefile.new"), "newick", plain=False)

        # Now write a tree with summary CDR/FR total changes

        if cdrfile is not None:
            ptree = copy.deepcopy(copy_tree)
            acdr = AnalyseCDR(int_aas, file_name=cdrfile)

            def category_label(clade, parent):
                return with_name(clade, acdr.category_diff(clade.name, parent.name))

            relabel(ptree, category_label)
            Phylo.write(ptree, in_wdir(tag + "annotated_treefile_sum.new"), "newick", plain=False)

        # And write a tree with counts of total AA changes

        ptree = copy.deepcopy(copy_tree)

        def count_label(clade, parent):
            diffs = list(int_aas.seqdiff(clade.name, parent.name))
            count = str(len(diffs)) if len(diffs) > 0 else ""
            if "node_" in clade.name:
                return count
            # No trailing space when there is nothing to report.
            return str(clade.name) + (" " + count if count else "")

        relabel(ptree, count_label)
        Phylo.write(ptree, in_wdir(tag + "annotated_treefile_tot.new"), "newick", plain=False)

        with open(in_wdir(tag + "aa_alignment.txt"), "w") as f:
            f.write(int_aas.report(100))

        with open(in_wdir(tag + "nt_alignment.txt"), "w") as f:
            f.write(int_seqs.report(100))

        for rec in int_aas:
            rec.description = ""

        AlignIO.write(int_aas, in_wdir(tag + "aa_alignment.fa"), "fasta")
        AlignIO.write(int_seqs, in_wdir(tag + "nt_alignment.fa"), "fasta")
        return int_aas
Beispiel #27
0
    def run_dnaml(self, seq_align, ptree, seqpattern, cdrfile, wdir, rep, tag=""):
        """Run dnaml and write annotated trees and alignments into wdir. Arguments are:
           seq_align: the input nt sequences (MultipleSequenceAlignment). The record ids are overwritten
                with safe serial names ("N%09d") as a side effect.
           ptree: phylogenetic tree (Bio.Phylo). It is re-rooted and its nodes are named in place.
           seqpattern: A list of sequence number directives, in the format accepted by Alignment.set_position_numbers
           cdrfile: file name handed to AnalyseCDR, or None to skip the CDR/FR summary tree
           wdir: the name of a directory that run_dnaml should use. This must exist already.
           rep: a function that takes a string as an argument. This will be called should an error or warning be
                necessary (may be called multiple times in one invocation).
           tag: an optional tag to prefix filenames with

           Sequences in seq_align must be the same length, must start on a codon boundary, and be an integral number
           of codons in length. The first sequence must be the ancestral sequence or outgroup. Exactly the same sequence
           names must occur in the alignment and the tree. Sequence name format is pretty flexible (sequences are
           mapped to safe serial names before dnaml runs and remapped afterwards).

           Returns the Alignment built from the dnaml output via read_nt (restricted to the names present in the
           tree), or None when dnaml produced no usable output (rep is called with the reason).
           Raises ValueError when the root sequence is missing from the tree or the trees cannot be rooted.
        """

        def in_wdir(name):
            # All working files live directly inside wdir.
            return wdir + "/" + name

        # Translate clade names to something safe
        namedict = {}
        for serial, seq in enumerate(seq_align, 1):
            namedict[seq.id] = "N%09d" % serial
            seq.id = namedict[seq.id]

        # Safe name -> original name, used to restore names afterwards.
        inv_dict = {v: k for k, v in namedict.items()}

        qtree = copy.deepcopy(ptree)

        for clade in qtree.get_terminals():
            if clade.name and clade.name in namedict:
                clade.name = namedict[clade.name]

        # Root the tree on the first record

        first = "N%09d" % 1

        try:
            # Builtin next() instead of the Python-2-only generator.next().
            qtree.root_with_outgroup(next(qtree.find_clades(name=re.escape(first))))
        except Exception:
            raise ValueError("Error: root sequence not found in tree.")

        try:
            ptree.root_with_outgroup(ptree.find_clades(name=re.escape(inv_dict[first])))
            Phylo.write(ptree, in_wdir("input_treefile.new"), "newick", plain=False)
        except Exception:
            raise ValueError("Error rooting trees: check for corrupt tree file or duplicated sequences.")

        # Write the sequences, in PHYLIP format (real PHYLIP format, as used by PHYLIP!)

        with open(in_wdir("infile"), "w") as f:
            f.write("  %d  %d\n" % (len(seq_align), len(seq_align[0])))
            for seq in seq_align:
                f.write("%10s%s\n" % (seq.id, seq.seq.upper()))

        # Write the tree file

        Phylo.write(qtree, in_wdir("intree"), "newick")

        # Remove stale results so that their presence afterwards proves dnaml ran.
        for stale in ("outfile", "outtree"):
            if os.path.exists(in_wdir(stale)):
                os.remove(in_wdir(stale))

        # The path to the module may reference either a .py or a .pyc file.
        # Only the extension is swapped: a plain str.replace on the whole path
        # would also rewrite any directory name that happens to contain ".py".

        module_path = os.path.abspath(__file__)
        ctlfile = os.path.splitext(module_path)[0] + ".ctl"

        # Check for dnaml next to this module

        dnamlfile = os.path.join(os.path.dirname(module_path), "dnaml")

        if not os.path.exists(dnamlfile):
            dnamlfile = "dnaml"  # must be on the path somewhere

        # The control file is fed to dnaml on stdin; its console output is logged.
        with open(in_wdir("dnaml.txt"), "w") as log, open(ctlfile, "r") as ctl:
            subprocess.call(dnamlfile, cwd=wdir, stdin=ctl, stdout=log)

        if not os.path.isfile(in_wdir("outfile")):
            rep("No output returned by dnaml: please check the logs for the issue.")
            return None

        if os.path.isfile(in_wdir("outfile.txt")):
            os.remove(in_wdir("outfile.txt"))
        os.rename(in_wdir("outfile"), in_wdir("outfile.txt"))

        intseqs = self.__parse_outfile(in_wdir("outfile.txt"))

        if not intseqs:
            rep("Unexpected output returned by dnaml: please check the logs for the issue.")
            return None

        # Custom sort function to put the root record first, then others supplied by the user, then intermediate nodes
        def key_ids(rec):
            if rec.id == "N%09d" % 1:
                return 'a__' + rec.id
            elif 'node_' in rec.id:
                return 'z__' + "%04d" % int(rec.id.split("_")[1])
            else:
                return 'l__' + rec.id

        labelled_tree = Phylo.read(in_wdir("outtree"), "newick")

        # intseqs is read twice, so it is rewound before each read.
        intseqs.seek(0)
        int_seqs = Alignment(file_name=intseqs, format="fasta")
        int_seqs.sort(key=key_ids)
        intseqs.seek(0)
        int_aas = Alignment()
        int_aas.read_nt(intseqs, "fasta")
        int_aas.sort(key=key_ids)
        int_aas.set_position_numbers(position_numbers=seqpattern)

        # Put back the original names in all our collections

        for collection in (int_seqs, int_aas):
            for seq in collection:
                if seq.id in inv_dict:
                    seq.id = inv_dict[seq.id]
                seq.name = ""
                seq.description = ""

        nodeid = 1
        for clade in labelled_tree.find_clades(order="preorder"):
            if clade.name is None:
                clade.name = "node_%d" % nodeid  # This relies on our traversal using the same order as dnaml
                nodeid += 1
            elif clade.name in inv_dict:
                clade.name = inv_dict[clade.name]

        # Now we need to map the labelling of the nodes in the labelled tree to the nodes in the original tree

        self.__map_names(ptree, labelled_tree)
        Phylo.write(ptree, in_wdir(tag + "intermediates_treefile.new"), "newick", plain=False)

        # Keep only the records whose names occur in the tree (a set gives O(1) membership tests).
        cladenames = {clade.name for clade in ptree.find_clades() if clade.name is not None}
        new_int_aas = Alignment()

        for rec in int_aas:
            if rec.id in cladenames:
                new_int_aas.append(rec)

        int_aas = new_int_aas
        int_aas.set_position_numbers(position_numbers=seqpattern)

        # Pristine copy: each annotated tree below starts from the same named tree.
        copy_tree = copy.deepcopy(ptree)

        def diffkey(diff):
            return int_aas.index_of(diff[1:-1])

        def parent_of(tree, clade):
            # Parent of the clade, falling back to the root sequence's clade
            # for clades that hang directly off the tree root.
            # NOTE(review): `first` is the safe serial name while the tree
            # appears to carry original names here, so `clade.name != first`
            # may always hold - confirm whether inv_dict[first] was intended.
            parent = self.__get_parent(tree, clade)
            if parent is None:
                path = tree.get_path(clade)
                if len(path) == 1 and clade.name != first:
                    parent = next(tree.find_clades(name=re.escape(inv_dict[first])))
            return parent

        def relabel(tree, make_label):
            # Compute every label first, then apply, so that lookups during
            # the first pass still see the unmodified clade names.
            labels = {}
            for clade in tree.find_clades():
                if clade.name is not None:
                    parent = parent_of(tree, clade)
                    if parent is not None and parent.name is not None:
                        labels[clade.name] = make_label(clade, parent)

            for clade in tree.find_clades():
                if clade.name is not None and clade.name in labels:
                    clade.name = labels[clade.name]

        def with_name(clade, text):
            # Intermediate nodes show the annotation only; others keep their name in front.
            if "node_" in clade.name:
                return text
            return str(clade.name) + " " + text

        # Calculate AA diffs between each node and its parent, and write to the tree

        def diff_label(clade, parent):
            diffs = list(int_aas.seqdiff(clade.name, parent.name))
            diffs.sort(key=diffkey)
            return with_name(clade, "+".join(diffs))

        relabel(ptree, diff_label)
        Phylo.write(ptree, in_wdir(tag + "annotated_treefile.new"), "newick", plain=False)

        # Now write a tree with summary CDR/FR total changes

        if cdrfile is not None:
            ptree = copy.deepcopy(copy_tree)
            acdr = AnalyseCDR(int_aas, file_name=cdrfile)

            def category_label(clade, parent):
                return with_name(clade, acdr.category_diff(clade.name, parent.name))

            relabel(ptree, category_label)
            Phylo.write(ptree, in_wdir(tag + "annotated_treefile_sum.new"), "newick", plain=False)

        # And write a tree with counts of total AA changes

        ptree = copy.deepcopy(copy_tree)

        def count_label(clade, parent):
            diffs = list(int_aas.seqdiff(clade.name, parent.name))
            count = str(len(diffs)) if len(diffs) > 0 else ""
            if "node_" in clade.name:
                return count
            # No trailing space when there is nothing to report.
            return str(clade.name) + (" " + count if count else "")

        relabel(ptree, count_label)
        Phylo.write(ptree, in_wdir(tag + "annotated_treefile_tot.new"), "newick", plain=False)

        with open(in_wdir(tag + "aa_alignment.txt"), "w") as f:
            f.write(int_aas.report(100))

        with open(in_wdir(tag + "nt_alignment.txt"), "w") as f:
            f.write(int_seqs.report(100))

        for rec in int_aas:
            rec.description = ""

        AlignIO.write(int_aas, in_wdir(tag + "aa_alignment.fa"), "fasta")
        AlignIO.write(int_seqs, in_wdir(tag + "nt_alignment.fa"), "fasta")
        return int_aas
Beispiel #28
0
def main(argv):
    """Command-line entry point for AnnotateTreeCmd.

    argv is the full argument vector, in one of two shapes:
      [prog, '-t']                                              run the self-tests
      [prog, seqnumfile, seqfile, treefile, cdrfile, tag, wd]   annotate a tree

    An empty cdrfile argument switches the CDR analysis off.

    Problems are reported on stdout and end the process through sys.exit(0);
    the zero exit status is kept as it was so existing callers see no change.
    """
    print("AnnotateTreeCmd v1.0")
    if len(argv) == 2 and argv[1] == '-t':
        conduct_tests()
        sys.exit(0)
    elif len(argv) != 7:
        print('usage python AnnotateTreeCmd.py seqnumfile seqfile treefile cdrfile tag wd.')
        sys.exit(0)

    # The first three file arguments are mandatory.
    for required_file in argv[1:4]:
        check_file(required_file)

    (seqnumfile, seqfile, treefile, cdrfile, tag, wdir) = argv[1:7]

    if len(cdrfile) > 0:
        check_file(cdrfile)
    else:
        cdrfile = None

    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except Exception:
        print("Error creating directory %s." % wdir)
        sys.exit(0)

    # 'except Exception' (not a bare except) is deliberate below: sys.exit()
    # raises SystemExit, which a bare except would catch and then report as a
    # bogus "Error parsing" message after the stop-codon message.
    try:
        msa = Alignment()
        msa.read_nt(seqfile)  # Check that the sequence comprises a valid set of codons
        for seq in msa:
            if '*' in seq:
                print("Stop codon found in sequence %s." % seq.id)
                sys.exit(0)
    except Exception:
        print("Error parsing %s: %s." % (seqfile, sys.exc_info()[1]))
        sys.exit(0)

    try:
        seq_pos = msa.read_position_numbers(seqnumfile)
    except Exception:
        print("Error parsing %s: %s." % (seqnumfile, sys.exc_info()[1]))
        sys.exit(0)

    if cdrfile is not None:
        # Constructed only to validate the CDR file early; the result is
        # rebuilt from the dnaml output further down.
        try:
            acdr = AnalyseCDR(msa, file_name=cdrfile)
        except Exception:
            print("Error parsing %s: %s." % (cdrfile, sys.exc_info()[1]))
            sys.exit(0)

    # Accept the sequence file as FASTA first, then fall back to PHYLIP.
    try:
        seq_align = AlignIO.read(seqfile, "fasta")
    except Exception:
        try:
            seq_align = AlignIO.read(seqfile, "phylip")
        except Exception:
            print("Error parsing %s: %s." % (seqfile, sys.exc_info()[1]))
            sys.exit(0)

    try:
        tree = Phylo.read(treefile, "newick")
    except Exception:
        print("Error parsing %s: %s." % (treefile, sys.exc_info()[1]))
        sys.exit(0)

    dnaml = Dnaml()

    int_aas = dnaml.run_dnaml(seq_align, tree, seq_pos, cdrfile, wdir, report,
                              tag)

    if int_aas is not None:
        # Every output file lives in wdir and carries the tag as a name prefix.
        prefix = wdir + "/" + tag

        try:
            if cdrfile is not None:
                acdr = AnalyseCDR(int_aas, file_name=cdrfile)
                cdr_output = acdr.analyse()
                with open(prefix + "cdr_analysis.html", "w") as fo:
                    fo.write(cdr_output)
        except Exception:
            print("Warning: CDRs were not analysed: " + str(sys.exc_info()[1]))

        try:
            # Render each tree as PNG then SVG, collecting garbage around
            # every render exactly as the unrolled original did.
            stems = ["annotated_treefile"]
            if cdrfile is not None:
                stems.append("annotated_treefile_sum")
            stems.extend(["annotated_treefile_tot", "intermediates_treefile"])

            gc.collect()
            for stem in stems:
                for ext in ("png", "svg"):
                    RenderTree.render_annotate(prefix + stem + ".new",
                                               prefix + stem + "." + ext)
                    gc.collect()
        except Exception:
            print("Error rendering trees: " + str(sys.exc_info()[1]))

        # The logo is built from the supplied sequences only: drop the first
        # record and every inferred intermediate ("node_") record.
        orig_recs = [
            rec
            for index, rec in enumerate(
                SeqIO.parse(prefix + "aa_alignment.fa", "fasta"))
            if index > 0 and "node_" not in rec.id
        ]

        logo_alignment_file = prefix + "alignment_for_logo.fa"
        SeqIO.write(orig_recs, logo_alignment_file, "fasta")

        with open(prefix + "weblogo_status.txt", "w") as fo:
            # NOTE(review): with shell=True a POSIX shell reports a missing
            # command as 127, not 1, so these checks may not fire when seqlogo
            # is absent - confirm the intended exit codes.
            retcode = subprocess.call(
                "seqlogo -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS"
                % tag,
                cwd=wdir,
                shell=True,
                stdout=fo,
                stderr=subprocess.STDOUT)
            if retcode == 1:
                fo.write("Trying seqlogo.pl instead.\n")
                retcode = subprocess.call(
                    "seqlogo.pl -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS"
                    % tag,
                    cwd=wdir,
                    shell=True,
                    stdout=fo,
                    stderr=subprocess.STDOUT)
            if retcode == 1:
                print("Weblogo not installed: logo plot will not be generated.")
Beispiel #29
0
from Alignment import Alignment

from Role import Role
from Roles.Medic import Medic
from Roles.Vigi import Vigi
from Roles.Cop import Cop
from Roles.Smart_Hunter_Ven import Smart_Hunter_Ven
from Roles.Innocent_Child import Innocent_Child
from Roles.Possum import Possum

import Strategy
from Strategies.VigiStrats import VigiStratSimpleActivation
from Strategies.VigiStrats import VigiStratSmartActivation

# Shared Alignment instances. Each one is described by a display name, a win
# condition string and a night-action string.
ALIGNMENT_TOWN = Alignment(name="Town",
                           win_con="Last Faction Standing",
                           night_action="None")
# NOTE(review): the two Mafia alignments below share the same name ("Mafia")
# and win condition; they differ only in their night_action string.
ALIGNMENT_MAFIA_2KP2 = Alignment(name="Mafia",
                                 win_con="Parity",
                                 night_action="2 kp till 2")
ALIGNMENT_MAFIA = Alignment(name="Mafia",
                            win_con="Parity",
                            night_action="1 kp")

# Pre-configured role instances.
MEDIC = Medic(name="Medic", active_n0=True, heal_self=False)
# NOTE(review): both vigilante variants carry the same display name; the only
# difference between them is the active_n0 flag.
ONESHOTVIGI = Vigi(name="One Shot Vigi", active_n0=True, charges=1)
ONESHOTVIGINON0 = Vigi(name="One Shot Vigi", active_n0=False, charges=1)

COP = Cop(name="Full Alignment Cop", active_n0=True, target_self=False)
SMART_HUNTER_VEN = Smart_Hunter_Ven(name="Smart Hunter")
INNOCENT_CHILD = Innocent_Child(name="Inno Child")
Beispiel #30
0
    if args.hand:
        hand_alignment = tfolder.get_alignment([args.lang1, args.lang2],
                                               backend='hand')
        forced_rungs = hand_alignment.as_ladder()
        print >> sys.stderr, "%d hand-aligned pairs found." % len(forced_rungs)
    # prealign
    if args.prealign:
        pre_alignment = list(find_matches(t1, t2, threshold=0.5, pair_count=100))
        forced_rungs.extend(pre_alignment)
        print >> sys.stderr, "%d sentence pairs matched." % len(pre_alignment)
    forced_rungs = sorted(set(forced_rungs))

    try:
        a = None
        a = make_composed_alignment(t1, t2, forced_rungs)
        a = Alignment(a)
    finally:
        output_filename = '%s/%s-%s.my' % (args.folder, args.lang1, args.lang2)
        if not a:
            raise SystemExit
        with open(output_filename, 'w') as f:
            for i, j, c in a.data:
                f.write("%d\t%d\t%.2f\n" % (i, j, c))
        print >> sys.stderr, "Wrote alignment to %s." % output_filename
        c = a.summed_cost()
        print >> sys.stderr, "Total cost", c, "avg", c/len(a.data)

        if args.plot:
            import plot
            plot.plot_cost_matrix(cost,
                                  a.as_ladder(),
# Stream SAM records from stdin and, for each read name, remember the most
# recent primary alignment seen on the '+' strand and on the '-' strand.
primer_range = Primer_range(BED_FILE)

# [+strand alignment, -strand alignment]
alignment_bucket = [None, None]

# NOTE(review): out_buffer1, out_buffer2 and cnt are initialised here but not
# used in the visible part of the script -- presumably consumed further down.
out_buffer1 = []
out_buffer2 = []
cnt = 0
current_read = 'default'  # Current read name
for sam_line in sys.stdin:
    # Skip header
    if sam_line.startswith('@'):
        continue

    alignment = Alignment(sam_line.rstrip())

    # Initialize alignment bucket whenever the read name changes.
    # This relies on records of the same read being adjacent in the input.
    if current_read != alignment.read_name:
        alignment_bucket = [None, None]
        current_read = alignment.read_name

    # Skip non-primary and supplemental alignments
    # (SAM FLAG bits 256 = secondary, 2048 = supplementary).
    if alignment.flag & (256 + 2048):
        continue

    # Slot 0 holds the '+' strand alignment, slot 1 the '-' strand alignment;
    # any other strand value leaves the bucket untouched.
    if alignment.strand == '+':
        alignment_bucket[0] = alignment
    elif alignment.strand == '-':
        alignment_bucket[1] = alignment
Beispiel #32
0
from Alignment import Alignment

# Resolved from an unfinished (nested) git merge: the conflict markers that
# were committed here made the module a SyntaxError. The incoming greeting
# lines are kept, and Alignment.test() stays enabled because both sides of the
# outer conflict ultimately call it.
msg = "welcome to my world, MK"
print(msg)

msg = "Hurry up...No time to lose!!!"
print(msg)

# Run the Alignment self-test.
Alignment.test()
	def getTemplates(self):
		"""Find candidate templates for self.seq via BLAST and store them.

		Sends self.seq to NCBI BLAST (blastp against the pdb database,
		expect=0.01). For every alignment in every returned record a Template
		is stored in self.templates and its fasta in self.fastas, both keyed by
		the alignment accession. Every HSP of that alignment becomes an
		Alignment object whose target/template strings are padded around the
		HSP; it is appended to the template's alignments and to self.alignments.

		Afterwards each fasta is written to self.fastasfolder and each
		alignment (as JSON) to self.alignmentsfolder; files that already exist
		are left untouched.

		Returns:
			The keys of self.templates (the template ids).
		"""
		# http://biopython.org/DIST/docs/api/Bio.Blast.NCBIWWW-module.html
		if self.debug:
			print(self.seq)
		# Send BLAST request to server: blastp (protein) against the pdb database
		result_handle = NCBIWWW.qblast("blastp","pdb",str(self.seq),expect=0.01)
		# Parse the results into blast records
		blast_records = NCBIXML.parse(result_handle)
		if self.debug:
			print("BLAST Request Finished")

		for record in blast_records:
			for alignment in record.alignments:
				# The alignment accession is used as the template key
				template_id = alignment.accession
				fasta = self.getFastaFromId(template_id)
				title = alignment.title
				length = alignment.length
				# Register the template and its fasta under this id
				self.templates[template_id] = Template(
					id=template_id,fasta=fasta,sequence=title,
					length=length,alignments=[]
				)
				self.fastas[template_id] = fasta

				# One Alignment object per HSP of this template
				for hsp in alignment.hsps:
					# Identities as a percentage of the full query length
					similarity = 100*hsp.identities/len(self.seq)
					a = Alignment(
						id=template_id,title=title,expect=hsp.expect,score=hsp.score,
						identities=hsp.identities,similarity=similarity,
						target=hsp.query,targetstart=hsp.query_start,match=hsp.match,
						template=hsp.sbjct,templatestart=hsp.sbjct_start,length=length
					)
					# The HSP isn't necessarily the same size as the sequence:
					# re-attach the query residues around it, and pad the
					# template with the same number of gaps on each side.
					frontlen = a.targetstart-1
					targetfront = str(self.seq[:frontlen])
					# NOTE(review): the tail offset uses the length passed to the
					# constructor (alignment.length); hsp.query_end may be the
					# intended boundary -- confirm.
					targetend = str(self.seq[(a.targetstart+a.length):])
					a.target = targetfront + a.target + targetend
					a.length = len(a.target)

					# Pad from the tail actually appended above. The previous
					# code recomputed this from a.length *after* it had been
					# overwritten, so target and template could differ in length.
					a.template = '-'*frontlen + a.template + '-'*len(targetend)

					if self.debug:
						print("Seq vs Target Length:",len(self.seq),a.length)

					# Append the alignment to the template's alignments
					self.templates[template_id].alignments.append(a)
					self.alignments.append(a)

					if self.debug:
						print()
						print('****ALIGNMENT***')
						print('id:',template_id)
						print('sequence:', title)
						print('length:',length)
						print('e value:', hsp.expect)
						print('score:', hsp.score)
						print('identities:',similarity) # need to print percentage of similarities
						print("Target  :" + hsp.query[0:75] + '...')
						print("Match   :" + hsp.match[0:75] + '...')
						print("Template:" + hsp.sbjct[0:75] + '...')
						print()

		# Save off the fasta files (existing files are not overwritten)
		for template_id,fasta in self.fastas.items():
			fname = '%s/%s.fasta' % (self.fastasfolder,template_id)
			if not os.path.exists(fname):
				# 'with' guarantees the handle is closed even if writing fails
				with open(fname,'w') as f:
					SeqIO.write(fasta,f,'fasta')

		# Save off the alignments (existing files are not overwritten)
		for i,a in enumerate(self.alignments):
			fname = '%s/%s-%s.alignment' % (self.alignmentsfolder,a.id,str(i))
			if not os.path.exists(fname):
				with open(fname,'w') as f:
					json.dump(a.toJSON(),f)

		return self.templates.keys()
Beispiel #34
0
#!/usr/bin/env python3
# Usage: ./msa_approx.py score_matrix 5 test.fa
# Requires Alignment.py in the directory

from Alignment import Alignment
from Alignment import GetArguments
import sys

# Parse the command line, build the aligner and compute the multiple alignment.
arguments = GetArguments(sys.argv)
test = Alignment(arguments.seqs, arguments.score_matrix, arguments.gapcost)
alignm = test.multiple_align()

# Print the result FASTA-style: row i of the alignment belongs to the input
# sequence at index test.seqOrder[i], so its header is looked up through that.
for i, row in enumerate(alignm):
    header = arguments.heads[test.seqOrder[i]]
    print(">", header)
    print(test.num_to_sequence(row), "\n")
Beispiel #35
0
    def gen():
        """Yield (i1, i2, i3) triples composed from al12 and map23.

        Starts with the (0, 0, 0) anchor. For each (i1, i2) pair in al12,
        every second-text index after the previous pair's i2 up to and
        including this i2 is looked up in map23; each mapped i3 yields one
        triple. Indices missing from map23 are skipped.
        """
        yield (0, 0, 0)
        last_i2 = 0
        for i1, upper_i2 in al12:
            for mid in range(last_i2 + 1, upper_i2 + 1):
                try:
                    # (a consistency check against map31 was left disabled
                    # by the original author)
                    for i3 in map23[mid]:
                        yield (i1, mid, i3)
                except KeyError:
                    continue
            last_i2 = upper_i2
    return Alignment(list(gen()), no_costs=True)

if __name__ == '__main__':
    # Usage: pass the folder holding the pairwise alignments (*.my) and the
    # three texts (pl.txt, cu.txt, el.txt) as the first argument.
    import sys
    name = sys.argv[1]

    # Pairwise ladders: pl-cu, cu-el and pl-el.
    a1 = Alignment.from_file(name + '/pl-cu.my').as_ladder()
    a2 = Alignment.from_file(name + '/cu-el.my').as_ladder()
    pl_el = Alignment.from_file(name + '/pl-el.my').as_ladder()
    # Swap each pair so the third ladder runs el -> pl.
    a3 = [(el, pl) for (pl, el) in pl_el]

    ma = merge_3_alignments(a1, a2, a3)

    # Load the sentence lists in the same order as the alignment columns.
    sentences = [
        Text.from_file(name + '/pl.txt', lang='pl').as_sentences_flat(),
        Text.from_file(name + '/cu.txt', lang='cu').as_sentences_flat(),
        Text.from_file(name + '/el.txt', lang='el').as_sentences_flat(),
    ]
    ma.pretty_print(*sentences)