def sort_by_tree(self):
    """
    Reorders the fixed alignment according to a tree file chosen by the user.

    The content of the fixed alignment widget is written to
    <work_dir>/<temp_name>.fixed, the external script 'sort_and_color.py'
    (taken from the script directory of the settings) is run on it with
    os.system, and the resulting <work_dir>/<temp_name>.tree_sorted file is
    loaded back into the fixed alignment widget. The IDs widget is refilled
    with the IDs in the new order and the action is written to the log tab.
    If the number of sequences changed, a warning is printed and shown in
    the status bar.
    """
    tree_filename = tkFileDialog.askopenfilename(initialdir = self.host.settings.work_dir, filetypes = (("Inkscape vector file", "*.svg"), ))
    if not tree_filename: # Cancel (the dialog may give "" or an empty tuple, both are falsy)
        return

    settings = self.host.settings
    old_order_seqs = Aln_basic.read_fasta_from_strings(self.fixed.get_strings())
    sort_and_color_path = os.path.join(settings.script_dir, "sort_and_color.py")
    fixed_filename = os.path.join(settings.work_dir, "%s.fixed" % self.host.temp_name)
    Aln_basic.write_widget_into_file(self.fixed.text_widget, fixed_filename)

    # The same command is used in both modes; output is only silenced in the non-verbose one
    sort_command = "%s -i %s -w %s -o %s -t %s" % (sort_and_color_path, os.path.basename(fixed_filename), settings.work_dir, self.host.temp_name, tree_filename)
    if not self.host.verbose.get():
        # NOTE(review): 'nul' is the Windows null device; confirm behavior on other systems
        sort_command += " 1> nul 2> nul"
    os.system(sort_command)

    temp_sorted_filename = os.path.join(settings.work_dir, "%s.tree_sorted" % self.host.temp_name)
    Aln_basic.read_widget_from_file(self.fixed.text_widget, temp_sorted_filename)

    # IDs widget is rebuilt from the freshly loaded (sorted) alignment
    self.IDs.text_widget.delete(1.0, tkinter.END)
    new_order_seqs = Aln_basic.read_fasta_from_strings(self.fixed.get_strings())
    new_ids = "".join("%s\n" % s.ID for s in new_order_seqs)
    self.IDs.text_widget.insert(tkinter.END, new_ids)

    print (" Alignment was sorted according to the %s tree file!" % tree_filename)
    self.host.log_tab.write_to_log("Alignment was sorted according to the tree file:\n%s" % tree_filename, True)
    if len(new_order_seqs) != len(old_order_seqs):
        print (" Number of sequences reduced from %i to %i! Possibly IDs in the tree file was interpreted badly" % (len(old_order_seqs), len(new_order_seqs)))
        self.host.set_status("Number of sequences reduced from %i to %i!" % (len(old_order_seqs), len(new_order_seqs)))
def apply_actions(self, ids_to_remove, ids_to_fix):
    """
    Removes the requested sequences from the unaligned sequences widget.

    Sequences are read from the unaligned input widget; if it is empty, a
    warning is shown and they are read from the alignment widget instead.
    The unaligned widget is then cleared and refilled with every sequence
    whose ID is not a key of <ids_to_remove>. A removal report grouped by
    reason is written to the log tab.

    ids_to_remove -- mapping of sequence ID to an indexable value: element 0
                     is the removal reason, element 1 is true if the organism
                     still remains in the sample.
    ids_to_fix    -- not used by this method.
    """
    print (" Removement started...")
    if self.seq_input_frame.text_is_empty():
        tkMessageBox.showwarning("Unaligned sequences are not provided", "Alnalyser cannot find unaligned sequences. They will be loaded from the alignment panel and unaligned. Consider building new alignment after this step!")
        seqs = Aln_basic.read_fasta_from_strings(self.aln_input_frame.get_strings(), True)
    else:
        seqs = Aln_basic.read_fasta_from_strings(self.seq_input_frame.get_strings())

    target_widget = self.seq_input_frame.text_widget
    target_widget.delete(1.0, tkinter.END)

    ids_by_reason = dict()
    orphaned = list() # (ID, organism) of removed proteins whose organism has left the sample
    removed_count = 0
    for record in seqs:
        if record.ID not in ids_to_remove:
            # Sequence survives: it is printed back to the widget
            target_widget.insert(tkinter.END, ">%s\n" % record.name)
            target_widget.insert(tkinter.END, record.sequence + "\n\n")
            continue
        removal_reason = ids_to_remove[record.ID][0]
        organism_remains = ids_to_remove[record.ID][1]
        ids_by_reason.setdefault(removal_reason, list()).append(record.ID)
        removed_count += 1
        if not organism_remains:
            orphaned.append((record.ID, record.organism))

    total = len(seqs)
    report = "Removement log: %i sequences removed from %i (%i remained)\n" % (removed_count, total, total - removed_count)
    report += self.host.purify_tab.get_curr_options()
    for removal_reason, removed_ids in ids_by_reason.items():
        report += "Reason(s) - %s:\n" % removal_reason
        report += "".join("%s, " % protein_id for protein_id in removed_ids)
        # Trailing ", " after the last ID is cut before the line is ended
        report = "%s\n" % (report.strip(", "))
    report += "\n"
    report += "For %i removements no protein from this organism remained in the sample\n" % len(orphaned)
    for (protein_id, organism) in orphaned:
        report += "%s\t%s\n" % (protein_id, organism)
    self.host.log_tab.write_to_log(report, True)
    print (" [..DONE..] Total %i removes done (%i possible)" % (removed_count, len(ids_to_remove)))
def align(self):
    """
    Aligns the sequences of the unaligned input widget with Muscle.

    The widget content is written to <work_dir>/<temp_name>.fasta, Muscle is
    run through os.system with the optional -maxiters, -gapopen and
    -gapextend options taken from the corresponding entries, and the result
    (<work_dir>/<temp_name>.aln) is loaded into the alignment widget. If the
    'insert blocks' flag is set, gap-only records >BLOCKS and >SITE are put
    in front of the alignment. Both temporary files are removed at the end.
    """
    if self.seq_input_frame.text_is_empty(): # No sequences were provided
        self.host.set_status("No sequences were provided to align!", "#FF0000")
        return
    print (" Alignment construction started...")
    (_muscle_name, muscle_path) = Settings.get_program_name(self.host.settings.muscle_dir, "muscle")

    unaligned_filename = os.path.join(self.host.settings.work_dir, "%s.fasta" % self.host.temp_name)
    Aln_basic.write_widget_into_file(self.seq_input_frame.text_widget, unaligned_filename)
    aligned_filename = os.path.join(self.host.settings.work_dir, "%s.aln" % self.host.temp_name)
    self.host.set_status("Alignment")

    maxiters_option = ""
    maxiters_value = self.maxiters.get()
    if maxiters_value != "":
        try:
            maxiters_option = "-maxiters %i" % int(maxiters_value)
        except (TypeError, ValueError): # int() of a non-numeric string raises ValueError
            print ("Option -maxiters is not an integer and is ignored!")

    gapopen_option = ""
    if self.gapopen.get() != "":
        if Aln_basic.is_negative_float(self.gapopen.get(), "-gapopen"):
            gapopen_option = "-gapopen %s" % self.gapopen.get()

    gapextend_option = ""
    if self.gapextend.get() != "":
        if Aln_basic.is_negative_float(self.gapextend.get(), "-gapextend"):
            gapextend_option = "-gapextend %s" % self.gapextend.get()

    muscle_command = "%s -in %s -out %s %s %s %s" % (muscle_path, unaligned_filename, aligned_filename, maxiters_option, gapopen_option, gapextend_option)
    print ("Muscle command to be ran:")
    print (muscle_command)
    if self.host.verbose.get():
        os.system(muscle_command)
    else:
        # NOTE(review): 'nul' is the Windows null device; confirm behavior on other systems
        os.system("%s 1> nul 2> nul" % muscle_command)

    Aln_basic.read_widget_from_file(self.aln_input_frame.text_widget, aligned_filename)
    if self.insert_blocks.get(): # Empty sequence >BLOCKS should be added
        curr_seqs = Aln_basic.read_fasta_from_strings(self.aln_input_frame.get_strings())
        if curr_seqs:
            self.aln_input_frame.text_widget.delete(1.0, tkinter.END)
            # Both service records are gap-only and as long as the first aligned sequence
            gap_line = "-" * len(curr_seqs[0].sequence)
            with open(aligned_filename, "w") as upd_aln_file:
                upd_aln_file.write(">BLOCKS\n")
                upd_aln_file.write("%s\n\n" % gap_line)
                upd_aln_file.write(">SITE\n")
                upd_aln_file.write("%s\n\n" % gap_line)
                for s in curr_seqs:
                    s.print_fasta(upd_aln_file, 60)
            Aln_basic.read_widget_from_file(self.aln_input_frame.text_widget, aligned_filename)
        else:
            # Nothing was read from the alignment widget: its length is unknown, so no records are added
            print (" [..WARNING..] Alignment is empty; >BLOCKS and >SITE records were not added!")

    self.host.set_status("Ready")
    os.remove(unaligned_filename)
    os.remove(aligned_filename)
    print (" [..DONE..]")
def histo_to_log(self):
    """
    Sends a histogram of sequence lengths to the log tab.

    Lengths are taken from the sequences of the 'pure' widget. The begin,
    step and number of steps values are read from their entries; if any of
    them is not an integer, a warning is printed and nothing is written.
    """
    parsed_seqs = Aln_basic.read_fasta_from_strings(self.pure.get_strings())
    lengths = [len(record.sequence) for record in parsed_seqs]
    try:
        start_value = int(self.begin_entry.get())
        step_size = int(self.step_entry.get())
        step_count = int(self.nsteps_entry.get())
    except ValueError:
        print (" [..WARNING..] Enter proper begin, step and number of steps before printing!")
        return
    self.host.log_tab.write_histogram(lengths, "Length of proteins in the alignment", start_value, step_size, step_count)
def filter_seq(self):
    """
    This method filters sequences in the input frame which have the same ID.
    Also could filter identical protein sequences.

    A sequence which repeats both the ID and the sequence of an earlier record
    is always removed. A sequence which repeats an ID but differs from the
    first record with this ID is kept and reported as 'bad'. Kept sequences
    whose residues repeat those of an earlier kept record are removed only if
    the user confirms it in the dialog. The input widget is rewritten with
    the remaining sequences and a report is written to the log tab.
    """
    seqs = Aln_basic.read_fasta_from_strings(self.seq_input_frame.get_strings())
    seq_size = len(seqs)

    first_sequence_of_id = dict() # ID -> sequence of the first record with this ID
    seen_sequences = set()
    kept = list()
    identical_positions = set()   # Positions in <kept> of records repeating an earlier sequence
    bad_ids = list()
    r = 0                         # Number of removed records
    for curr_seq in seqs:
        if curr_seq.ID in first_sequence_of_id:
            if curr_seq.sequence == first_sequence_of_id[curr_seq.ID]:
                r += 1            # Exact duplicate: dropped without asking
                continue
            bad_ids.append(curr_seq.ID) # Sequences differs
        else:                     # This is normal sequence
            first_sequence_of_id[curr_seq.ID] = curr_seq.sequence
        # Positions (not IDs) are stored, so that a record sharing its ID
        # with a flagged one is not removed together with it
        if curr_seq.sequence in seen_sequences:
            identical_positions.add(len(kept))
        else:
            seen_sequences.add(curr_seq.sequence)
        kept.append(curr_seq)

    if identical_positions:
        answer = tkMessageBox.askyesno("Filter identical sequences?", "We found %i sequences which are identical with some other sequence in alignment. Do you want to remove them?" % len(identical_positions), icon = "question", parent = self)
        if answer:
            r += len(identical_positions)
            kept = [curr_seq for (position, curr_seq) in enumerate(kept) if position not in identical_positions]

    self.seq_input_frame.text_widget.delete(1.0, tkinter.END)
    for curr_seq in kept:
        self.seq_input_frame.text_widget.insert(tkinter.END, ">%s\n%s\n\n" % (curr_seq.name, curr_seq.sequence))

    curr_message = "Filtering of %i input sequences; %i sequences removed; %i remained\n" % (seq_size, r, len(kept))
    if not bad_ids:
        self.host.set_status("Filtering gained success; %i sequences removed!" % r, self.host.header)
        curr_message += "Filtering gained success, all non-unique IDs removed\n"
    else:
        self.host.set_status("Filtering NOT gained success; see console for details!")
        print ("These sequences have duplicate IDs but different sequences; NOT removed:")
        curr_message += "Filtering NOT gained success, NOT all non-unique IDs removed; these remains:\n"
        for bad in bad_ids:
            print (bad)
            curr_message += "%s\n" % bad
    self.host.log_tab.write_to_log(curr_message, True)
def purify(self):
    """
    Prepares the purify tab from the alignment of the parse tab.

    The alignment is read from the 'fixed' widget of the parse tab. The
    region returned by Aln_basic.get_valid_alignment_range() for the presence
    threshold (value of the presence entry, 50 if it is not an integer) is
    printed to the alignment widget of the purify tab, one sequence per line
    with its name cut or padded to 50 characters. Per-sequence data (cut
    sequences, organism names, ID order) and the region borders are stored
    in the purify tab, its buttons are activated and the self-hits entries
    are emptied and disabled.
    """
    seqs = Aln_basic.read_fasta_from_strings(self.parse_tab.fixed.get_strings())
    if len(seqs) == 0:
        self.set_status("No alignment to purify!")
        return
    import udav_base
    self.set_status("Working")

    # --------------------------------------- 1) Calculating cut for mainly gappy parts of alignment
    max_name_length = 50
    separator = " : "
    presence_threshold = 50
    try:
        presence_threshold = int(self.purify_tab.presence_entry.get())
    except ValueError: # Entry is empty or is not an integer
        print("Using default presence threshold value = 50!")
    (valid_start, valid_end) = Aln_basic.get_valid_alignment_range(seqs, presence_threshold)
    self.purify_tab.alignment.add_label_data("showing a region [%i; %i]" % (valid_start + 1, valid_end))
    self.purify_tab.alignment.text_widget.delete(1.0, tkinter.END)

    # --------------------------------------- 2) Printing alignment into the text widget tab
    self.set_status("Printing alignment", "#FF0000")
    seqs_cut = list()           # List of sequences with the mainly gappy parts of alignment cut
    id_to_org_and_seq = dict()  # Hash of protein ids to a tuple of (0) their organism name and (1) their full sequences
    id_list = list()            # List of protein ids in order of their occurence
    for curr_seq in seqs:
        # Printing to the widget
        fit_name = curr_seq.name[0:max_name_length].ljust(max_name_length)
        seq_part = curr_seq.sequence[valid_start:valid_end]
        seqs_cut.append(udav_base.Sequence(curr_seq.name, seq_part))
        self.purify_tab.alignment.text_widget.insert(tkinter.END, "%s%s%s\n" % (fit_name, separator, seq_part))

        # Saving data
        if curr_seq.ID in id_to_org_and_seq:
            print(" [..WARNING..] Non-unique ID '%s' detected; purification may work unproperly!" % curr_seq.ID)
        # Organism is the last field of a 'a|b|c' or 'a|b' name; otherwise it is the name without the ID
        curr_org_name = curr_seq.name.replace(curr_seq.ID, "")
        if re.match(r"^[^\|]+\|[^\|]+\|[^\|]+$", curr_seq.name):
            curr_org_name = curr_seq.name.split("|")[2]
        elif re.match(r"^[^\|]+\|[^\|]+$", curr_seq.name):
            curr_org_name = curr_seq.name.split("|")[1]
        id_to_org_and_seq[curr_seq.ID] = (curr_org_name, curr_seq.sequence)
        id_list.append(curr_seq.ID)

    self.purify_tab.id_to_org_and_seq = id_to_org_and_seq
    self.purify_tab.id_list = id_list
    self.purify_tab.seqs_cut = seqs_cut
    self.purify_tab.id_to_features = None
    self.purify_tab.featured_sequences = None
    self.purify_tab.valid_start = valid_start
    self.purify_tab.valid_end = valid_end
    self.purify_tab.name_length = max_name_length + len(separator)
    self.purify_tab.activate_buttons()

    #FIX: version 0.2.8 (self-hits data is set to default)
    self.purify_tab.evalue_threshold.delete(0, tkinter.END)
    self.purify_tab.evalue_threshold.configure(state=tkinter.DISABLED)
    self.purify_tab.sigma_num_self.delete(0, tkinter.END)
    self.purify_tab.sigma_num_self.configure(state=tkinter.DISABLED)
    self.set_status("Ready")