Esempio n. 1
0
    def sort_by_tree(self):
        """
        Reorders the fixed alignment according to a tree file picked by the user.

        The content of the fixed alignment widget is written to the
        '<temp_name>.fixed' file in the working directory and the external
        'sort_and_color.py' script is launched on it through the shell. After
        that the '<temp_name>.tree_sorted' file (expected to be written by the
        script) is loaded back into the widget and the list of IDs is rebuilt
        from it. Nothing is done if the file dialog was cancelled.
        """
        settings = self.host.settings
        tree_filename = tkFileDialog.askopenfilename(initialdir = settings.work_dir,
                                                     filetypes = (("Inkscape vector file", "*.svg"), ))
        if tree_filename == "": # Cancel
            return
        # Remember the initial set of sequences to detect losses after sorting
        seqs_before = Aln_basic.read_fasta_from_strings(self.fixed.get_strings())

        script_path = os.path.join(settings.script_dir, "sort_and_color.py")
        fixed_filename = os.path.join(settings.work_dir, "%s.fixed" % self.host.temp_name)
        Aln_basic.write_widget_into_file(self.fixed.text_widget, fixed_filename)

        command = "%s -i %s -w %s -o %s -t %s" % (script_path, os.path.basename(fixed_filename),
                                                  settings.work_dir, self.host.temp_name, tree_filename)
        if not self.host.verbose.get():
            # Output of the script is discarded in the non-verbose mode
            command = "%s 1> nul 2> nul" % command
        os.system(command)

        sorted_filename = os.path.join(settings.work_dir, "%s.tree_sorted" % self.host.temp_name)
        Aln_basic.read_widget_from_file(self.fixed.text_widget, sorted_filename)

        # The list of IDs should follow the new order of sequences
        self.IDs.text_widget.delete(1.0, tkinter.END)
        seqs_after = Aln_basic.read_fasta_from_strings(self.fixed.get_strings())
        self.IDs.text_widget.insert(tkinter.END, "".join("%s\n" % seq.ID for seq in seqs_after))

        print ("    Alignment was sorted according to the %s tree file!" % tree_filename)
        self.host.log_tab.write_to_log("Alignment was sorted according to the tree file:\n%s" % tree_filename, True)
        count_before = len(seqs_before)
        count_after = len(seqs_after)
        if count_after != count_before:
            print ("    Number of sequences reduced from %i to %i! Possibly IDs in the tree file was interpreted badly" % (count_before, count_after))
            self.host.set_status("Number of sequences reduced from %i to %i!" % (count_before, count_after))
Esempio n. 2
0
    def apply_actions(self, ids_to_remove, ids_to_fix):
        """
        Removes sequences listed in <ids_to_remove> from the unaligned sequences widget.

        <ids_to_remove> maps a sequence ID to an indexable value: element [0]
        is the reason of removal, element [1] is a flag which is true if some
        other protein of the same organism remains in the sample.
        <ids_to_fix> is not used by this method.

        If the unaligned sequences widget is empty, sequences are taken from
        the alignment widget instead (the user is warned about it). Sequences
        which are kept are written to the unaligned sequences widget and a
        summary of removals is written to the log.
        """
        print ("    Removement started...")
        if self.seq_input_frame.text_is_empty():
            tkMessageBox.showwarning("Unaligned sequences are not provided", "Alnalyser cannot find unaligned sequences. They will be loaded from the alignment panel and unaligned. Consider building new alignment after this step!")
            seqs = Aln_basic.read_fasta_from_strings(self.aln_input_frame.get_strings(), True)
        else:
            seqs = Aln_basic.read_fasta_from_strings(self.seq_input_frame.get_strings())
            self.seq_input_frame.text_widget.delete(1.0, tkinter.END)

        ids_by_reason = dict() # Reason of removal -> list of removed protein IDs
        orphaned = list()      # (ID, organism) pairs for removals which left no protein of the organism
        removed_count = 0
        for record in seqs:
            if record.ID not in ids_to_remove: # This sequence is kept
                self.seq_input_frame.text_widget.insert(tkinter.END, ">%s\n" % record.name)
                self.seq_input_frame.text_widget.insert(tkinter.END, record.sequence + "\n\n")
                continue
            reason = ids_to_remove[record.ID][0]
            org_remains = ids_to_remove[record.ID][1]
            ids_by_reason.setdefault(reason, list()).append(record.ID)
            removed_count += 1
            if not org_remains:
                orphaned.append((record.ID, record.organism))

        total = len(seqs)
        report = "Removement log: %i sequences removed from %i (%i remained)\n" % (removed_count, total, total - removed_count)
        report += self.host.purify_tab.get_curr_options()
        for (reason, protein_ids) in ids_by_reason.items():
            report += "Reason(s) - %s:\n" % reason
            report += "".join("%s, " % protein_id for protein_id in protein_ids)
            # Trailing ', ' left after the last ID is dropped here
            report = "%s\n" % (report.strip(", "))
        report += "\n"
        report += "For %i removements no protein from this organism remained in the sample\n" % len(orphaned)
        report += "".join("%s\t%s\n" % (protein_id, organism) for (protein_id, organism) in orphaned)
        self.host.log_tab.write_to_log(report, True)

        print ("    [..DONE..] Total %i removes done (%i possible)" % (removed_count, len(ids_to_remove.keys())))
Esempio n. 3
0
    def align(self):
        """
        Builds an alignment of the sequences from the input widget with Muscle.

        Sequences are written to the '<temp_name>.fasta' file in the working
        directory, Muscle is launched through the shell and the resulting
        '<temp_name>.aln' file is loaded into the alignment widget. Optional
        '-maxiters', '-gapopen' and '-gapextend' values are taken from the
        respective entries; a value which does not pass validation is ignored.
        If the 'insert blocks' option is set, empty (gap-only) '>BLOCKS' and
        '>SITE' sequences are added on top of the alignment.
        Both temporary files are removed at the end.
        """
        if self.seq_input_frame.text_is_empty(): # No sequences were provided
            self.host.set_status("No sequences were provided to align!", "#FF0000")
            return
        print ("    Alignment construction started...")
        (muscle_name, muscle_path) = Settings.get_program_name(self.host.settings.muscle_dir, "muscle")
        unaligned_filename = os.path.join(self.host.settings.work_dir, "%s.fasta" % self.host.temp_name)
        Aln_basic.write_widget_into_file(self.seq_input_frame.text_widget, unaligned_filename)
        aligned_filename = os.path.join(self.host.settings.work_dir, "%s.aln" % self.host.temp_name)

        self.host.set_status("Alignment")
        maxiters_option = ""
        maxiters_value = self.maxiters.get()
        if maxiters_value != "":
            try:
                maxiters_option = "-maxiters %i" % int(maxiters_value)
            except (TypeError, ValueError):
                # int() raises ValueError for a non-numeric string; TypeError
                # is kept for values which are not strings or numbers at all
                print ("Option -maxiters is not an integer and is ignored!")
        gapopen_option = ""
        if self.gapopen.get() != "":
            if Aln_basic.is_negative_float(self.gapopen.get(), "-gapopen"):
                gapopen_option = "-gapopen %s" % self.gapopen.get()
        gapextend_option = ""
        if self.gapextend.get() != "":
            if Aln_basic.is_negative_float(self.gapextend.get(), "-gapextend"):
                gapextend_option = "-gapextend %s" % self.gapextend.get()
        muscle_command = "%s -in %s -out %s %s %s %s" % (muscle_path, unaligned_filename, aligned_filename, maxiters_option, gapopen_option, gapextend_option)
        print ("Muscle command to be ran:")
        print (muscle_command)
        if self.host.verbose.get():
            os.system(muscle_command)
        else:
            # Output of Muscle is discarded in the non-verbose mode
            os.system("%s 1> nul 2> nul" % muscle_command)

        Aln_basic.read_widget_from_file(self.aln_input_frame.text_widget, aligned_filename)
        if self.insert_blocks.get(): # Empty sequence >BLOCKS should be added
            curr_seqs = Aln_basic.read_fasta_from_strings(self.aln_input_frame.get_strings())
            if curr_seqs:
                # Gap-only line of the same length as the first aligned sequence
                gap_line = "-" * len(curr_seqs[0].sequence)
                self.aln_input_frame.text_widget.delete(1.0, tkinter.END)
                with open(aligned_filename, "w") as upd_aln_file:
                    upd_aln_file.write(">BLOCKS\n")
                    upd_aln_file.write("%s\n\n" % gap_line)
                    upd_aln_file.write(">SITE\n")
                    upd_aln_file.write("%s\n\n" % gap_line)
                    for s in curr_seqs:
                        s.print_fasta(upd_aln_file, 60)
                Aln_basic.read_widget_from_file(self.aln_input_frame.text_widget, aligned_filename)
            else:
                # Nothing to measure the alignment length with; widget is left as is
                print ("    [..WARNING..] Alignment is empty; >BLOCKS and >SITE sequences were not added!")

        self.host.set_status("Ready")

        os.remove(unaligned_filename)
        os.remove(aligned_filename)
        print ("    [..DONE..]")
Esempio n. 4
0
 def histo_to_log(self):
     """
     Writes a histogram of sequence lengths to the log.

     Lengths are taken from the sequences of the 'pure' widget. Begin, step
     and number of steps are read from the respective entries; if any of
     them is not an integer, a warning is printed and nothing is written.
     """
     lengths = [len(record.sequence)
                for record in Aln_basic.read_fasta_from_strings(self.pure.get_strings())]
     try:
         begin = int(self.begin_entry.get())
         step = int(self.step_entry.get())
         nsteps = int(self.nsteps_entry.get())
     except ValueError:
         print ("    [..WARNING..] Enter proper begin, step and number of steps before printing!")
         return
     self.host.log_tab.write_histogram(lengths, "Length of proteins in the alignment", begin, step, nsteps)
Esempio n. 5
0
    def filter_seq(self):
        """
        This method filters sequences in the input frame which have the same ID.
        Also could filter identical protein sequences.

        A sequence which repeats both the ID and the sequence of an earlier
        entry is always removed. A sequence which repeats an ID but has a
        different sequence is kept and reported. Sequences which are identical
        to an earlier kept sequence (but are not exact ID duplicates) are
        removed only if the user confirms this in a dialog.
        The widget is rewritten with the remaining sequences and a summary is
        written to the log.
        """
        seqs = Aln_basic.read_fasta_from_strings(self.seq_input_frame.get_strings())
        seq_size = len(seqs)
        first_seq_by_id = dict() # ID -> sequence of the first entry with this ID
        seen_sequences = set()   # Sequences of all entries kept so far
        kept = list()            # (entry, True if it repeats the sequence of an earlier kept entry)
        bad_ids = list()         # IDs met again with a different sequence
        r = 0                    # Number of removed entries

        # Single pass building a new list: removing elements of the list under
        # iteration made the previous entry to be checked twice and wrongly
        # marked as identical to some other sequence
        for entry in seqs:
            if entry.ID in first_seq_by_id:
                if entry.sequence == first_seq_by_id[entry.ID]: # Exact duplicate
                    r += 1
                    continue
                bad_ids.append(entry.ID) # Sequences differs
            else: # This is normal sequence
                first_seq_by_id[entry.ID] = entry.sequence
            is_repeat = entry.sequence in seen_sequences
            seen_sequences.add(entry.sequence)
            kept.append((entry, is_repeat))

        # Repeats are tracked per entry (not per ID), thus the first entry with
        # a non-unique ID is not removed together with its repeated namesake
        repeats_num = sum(1 for (_, is_repeat) in kept if is_repeat)
        if repeats_num != 0:
            answer = tkMessageBox.askyesno("Filter identical sequences?", "We found %i sequences which are identical with some other sequence in alignment. Do you want to remove them?" % repeats_num, icon = "question", parent = self)
            if answer == True:
                kept = [pair for pair in kept if not pair[1]]
                r += repeats_num
        seqs = [entry for (entry, _) in kept]

        self.seq_input_frame.text_widget.delete(1.0, tkinter.END)
        for entry in seqs:
            self.seq_input_frame.text_widget.insert(tkinter.END, ">%s\n%s\n\n" % (entry.name, entry.sequence))

        curr_message = "Filtering of %i input sequences; %i sequences removed; %i remained\n" % (seq_size, r, len(seqs))
        if not bad_ids:
            self.host.set_status("Filtering gained success; %i sequences removed!" % r, self.host.header)
            curr_message += "Filtering gained success, all non-unique IDs removed\n"
        else:
            self.host.set_status("Filtering NOT gained success; see console for details!")
            print ("These sequences have duplicate IDs but different sequences; NOT removed:")
            curr_message += "Filtering NOT gained success, NOT all non-unique IDs removed; these remains:\n"
            for bad in bad_ids:
                print (bad)
                curr_message += "%s\n" % bad
        self.host.log_tab.write_to_log(curr_message, True)
Esempio n. 6
0
    def purify(self):
        """
        Prepares the purification tab from the fixed alignment of the parse tab.

        The alignment range returned by Aln_basic.get_valid_alignment_range()
        for the presence threshold (taken from the entry of the purify tab,
        50 if the entry does not contain an integer) is printed into the
        alignment widget of the purify tab, one 'name  :  sequence' line per
        sequence. Data required by the purify tab (cut sequences, ID list,
        ID to organism and full sequence mapping, range and name width) is
        stored in its attributes, and the self-hits entries are cleared and
        disabled. Does nothing except setting the status if there is no
        alignment.
        """
        #seqs = Aln_basic.read_fasta_from_strings(self.input_tab.aln_input_frame.get_strings())
        seqs = Aln_basic.read_fasta_from_strings(
            self.parse_tab.fixed.get_strings())
        if len(seqs) == 0:
            self.set_status("No alignment to purify!")
            return

        import udav_base
        self.set_status("Working")
        # --------------------------------------- 1) Calculating cut for mainly gappy parts of alignment
        max_name_length = 50
        separator = "  :  "
        presence_threshold = 50
        try:
            presence_threshold = int(self.purify_tab.presence_entry.get())
        except (TypeError, ValueError):
            # Only failures of the conversion itself fall back to the default
            print("Using default presence threshold value = 50!")

        (valid_start, valid_end) = Aln_basic.get_valid_alignment_range(
            seqs, presence_threshold)
        self.purify_tab.alignment.add_label_data("showing a region [%i; %i]" %
                                                 (valid_start + 1, valid_end))
        self.purify_tab.alignment.text_widget.delete(1.0, tkinter.END)

        # --------------------------------------- 2) Printing alignment into the text widget tab
        self.set_status("Printing alignment", "#FF0000")
        seqs_cut = list()  # List of sequences with the mainly gappy parts of alignment cut
        id_to_org_and_seq = dict()  # Hash of protein ids to a tuple of (0) their organism name and (1) full sequences
        id_list = list()  # List of protein ids in order of their occurence
        # Names consisting of three or two '|'-separated fields; raw strings
        # are used because '\|' is not a valid escape of a usual string literal
        three_fields = re.compile(r"^[^\|]+\|[^\|]+\|[^\|]+$")
        two_fields = re.compile(r"^[^\|]+\|[^\|]+$")
        for seq in seqs:
            # Printing to the widget; name is cut or padded to the fixed width
            fit_name = seq.name[0:max_name_length].ljust(max_name_length)
            seq_part = seq.sequence[valid_start:valid_end]
            seqs_cut.append(udav_base.Sequence(seq.name, seq_part))
            string = fit_name + ("%s%s" % (separator, seq_part))
            self.purify_tab.alignment.text_widget.insert(
                tkinter.END, "%s\n" % string)
            # Saving data
            if seq.ID in id_to_org_and_seq:
                print(
                    "    [..WARNING..] Non-unique ID '%s' detected; purification may work unproperly!"
                    % seq.ID)

            # By default everything in the name except the ID is the organism;
            # for '|'-separated names the last field is taken instead
            curr_org_name = seq.name.replace(seq.ID, "")
            if three_fields.match(seq.name):
                curr_org_name = seq.name.split("|")[2]
            elif two_fields.match(seq.name):
                curr_org_name = seq.name.split("|")[1]

            id_to_org_and_seq[seq.ID] = (curr_org_name, seq.sequence)
            id_list.append(seq.ID)

        self.purify_tab.id_to_org_and_seq = id_to_org_and_seq
        self.purify_tab.id_list = id_list
        self.purify_tab.seqs_cut = seqs_cut
        self.purify_tab.id_to_features = None
        self.purify_tab.featured_sequences = None
        self.purify_tab.valid_start = valid_start
        self.purify_tab.valid_end = valid_end
        self.purify_tab.name_length = max_name_length + len(separator)
        self.purify_tab.activate_buttons()
        #FIX: version 0.2.8 (self-hits data is set to default)
        self.purify_tab.evalue_threshold.delete(0, tkinter.END)
        self.purify_tab.evalue_threshold.configure(state=tkinter.DISABLED)
        self.purify_tab.sigma_num_self.delete(0, tkinter.END)
        self.purify_tab.sigma_num_self.configure(state=tkinter.DISABLED)

        self.set_status("Ready")