def get_codon_alignment_from_files(self, protein_aln_file, nucleotide_seq_file, codon_alignment_file,
                                   cds2protein_accordance_file=None,
                                   alignment_format="fasta", nucleotide_sequence_format="fasta",
                                   cds_index_file=None, retain_cds_index=False):
    protein_aln_dict = AlignIO.read(protein_aln_file, format=alignment_format)
    nucleotide_seq_dict = SeqIO.index_db(cds_index_file if cds_index_file else "nuc_tmp.idx",
                                         nucleotide_seq_file,
                                         format=nucleotide_sequence_format)

    protein2cds_accordance_dict = None
    if cds2protein_accordance_file:
        protein2cds_accordance_dict = SynDict()
        protein2cds_accordance_dict.read(cds2protein_accordance_file, key_index=1, value_index=0)

    self.get_codon_alignment(protein_aln_dict, nucleotide_seq_dict, codon_alignment_file,
                             protein2cds_accordance_dict=protein2cds_accordance_dict)

    # remove the temporary index only if it was created here and not explicitly retained
    if (not cds_index_file) and (not retain_cds_index):
        os.remove("nuc_tmp.idx")

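# Usage sketch (hypothetical file names; assumes "routines" is an instance of
# this class and that get_codon_alignment() is defined elsewhere on it):
#
#   routines.get_codon_alignment_from_files("pep.aln.fasta", "cds.fasta",
#                                           "codon.aln.fasta",
#                                           cds2protein_accordance_file="cds2pep.tab")
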
def get_families_from_top_hits(top_hits_file, fam_file):
    hit_dict = SynDict()
    hit_dict.read(top_hits_file, allow_repeats_of_key=True, key_index=1, value_index=0,
                  comments_prefix="#")
    hit_dict.write(fam_file, splited_values=True)
    return hit_dict

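# Usage sketch (hypothetical file names): collapses a tab-separated top-hits
# table into families keyed by column 1, with the column 0 entries as members:
#
#   families = get_families_from_top_hits("top_hits.tab", "families.fam")
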
def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
    hits_dict = SynDict()
    hits_dict.read(domtblout_file, header=False, separator=None, allow_repeats_of_key=True,
                   key_index=3, value_index=0, comments_prefix="#")
    if output_file:
        hits_dict.write(output_file, splited_values=True)
    return hits_dict

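# Usage sketch (hypothetical file names): with separator=None the
# whitespace-delimited domtblout columns are split directly, so column 3
# (the query name in HMMER domtblout) keys the domain (target) names from
# column 0:
#
#   dom_hits = extract_dom_names_hits_from_domtblout("search.domtblout",
#                                                    "query2domains.tab")
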
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file, output_dir="./",
                                             seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    from Routines import SequenceRoutines
    cluster_id_list = IdList()
    cluster_dict = SynDict()
    self.safe_mkdir(output_dir)
    out_dir = self.check_path(output_dir)
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", self.make_list_of_path_to_files(seq_file), format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id], protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(s): %s" % (fam_id, ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue
        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                self.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id], verbose=True),
                        out_file, format=seq_format)

    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))
    return number_of_skipped_clusters

def extract_hits_from_tbl_output(blast_hits, output_file):
    hits = SynDict()
    hits.read(blast_hits, allow_repeats_of_key=True, key_index=0, value_index=1, separator="\t")
    hits.write(output_file, splited_values=True, separator="\t", values_separator=",")
    return hits

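# Usage sketch (hypothetical file names; assumes tabular BLAST output in which
# column 0 is the query id and column 1 the subject id, as in -outfmt 6):
#
#   query2hits = extract_hits_from_tbl_output("blast.outfmt6.tab", "query2hits.tab")
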
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                elements_with_absent_synonyms_file=None,
                                syn_file_key_column_index=0, syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#", key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index, separator=syn_file_column_separator)

    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",", comments_prefix="#")

    output_clusters_dict = SynDict()
    absent_elements_dict = SynDict()

    for cluster in clusters_dict:
        renamed_element_list = []
        all_elements_were_renamed_flag = True
        for element in clusters_dict[cluster]:
            if element in syn_dict:
                renamed_element_list.append(syn_dict[element])
            else:
                # keep the original id and record it as lacking a synonym
                if cluster not in absent_elements_dict:
                    absent_elements_dict[cluster] = [element]
                else:
                    absent_elements_dict[cluster].append(element)
                all_elements_were_renamed_flag = False
                renamed_element_list.append(element)

        if all_elements_were_renamed_flag or (not remove_clusters_with_not_renamed_elements):
            output_clusters_dict[cluster] = renamed_element_list

    output_clusters_dict.write(output_clusters_file, splited_values=True)
    if elements_with_absent_synonyms_file:
        absent_elements_dict.write(elements_with_absent_synonyms_file, splited_values=True)

    return absent_elements_dict

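# Usage sketch (hypothetical file names): renames cluster members via a
# two-column synonym table and records members lacking synonyms:
#
#   absent = rename_elements_in_clusters("clusters.fam", "old2new.tab",
#                                        "clusters.renamed.fam",
#                                        elements_with_absent_synonyms_file="no_synonym.tab")
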
def add_len_to_simple_output(top_hits_simple, len_file, out_file):
    len_dict = SynDict()
    len_dict.read(len_file)
    with open(top_hits_simple, "r") as in_fd:
        with open(out_file, "w") as out_fd:
            for line in in_fd:
                tmp_list = line.strip().split("\t")
                # last column: fraction of the query covered, assuming columns 1 and 2
                # hold 1-based start/end coordinates of the hit on the query
                out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n" % (tmp_list[0], len_dict[tmp_list[0]],
                                                           tmp_list[3], tmp_list[1], tmp_list[2],
                                                           (float(tmp_list[2]) - float(tmp_list[1]) + 1) / float(len_dict[tmp_list[0]])))

def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
    syn_dict = SynDict()
    syn_dict.read(synonyms_file, comments_prefix="#")
    with open(input_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                else:
                    line_list = line.split("\t")
                    if line_list[0] in syn_dict:
                        line_list[0] = syn_dict[line_list[0]]
                        out_fd.write("\t".join(line_list))
                    else:
                        out_fd.write(line)

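# Usage sketch (hypothetical file names): rewrites the seqid column of each GFF
# record using a two-column synonym table, leaving comment lines untouched:
#
#   replace_region_names_in_gff("annotation.gff", "scaffold2chr.tab",
#                               "annotation.renamed.gff")
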
def label_cluster_elements_from_file(self, input_file, label, output_file, separator="@",
                                     label_position="first"):
    input_dict = SynDict()
    input_dict.read(input_file, split_values=True, comments_prefix="#")
    output_dict = self.label_cluster_elements(input_dict, label, separator=separator,
                                              label_position=label_position)
    output_dict.write(output_file, splited_values=True)
    return output_dict

def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w"):
    """
    mode: "w" - if elements from element_id_list are present in a cluster, extract only those elements
          "a" - if elements from element_id_list are present in a cluster, extract all elements of the cluster
    """
    cluster_dict = SynDict()
    cluster_dict.read(cluster_file, split_values=True, comments_prefix="#")
    element_id_list = IdList()
    element_id_list.read(element_file, comments_prefix="#")
    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list, mode=mode)
    extracted_clusters.write(output_file, splited_values=True)

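# Usage sketch (hypothetical file names; assumes "routines" is an instance of
# this class). Per the docstring, mode="w" keeps only the listed elements while
# mode="a" keeps every element of any cluster containing one of them:
#
#   routines.extract_clusters_by_element_ids_from_file("clusters.fam", "ids.t",
#                                                      "subset.fam", mode="a")
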
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None,
                                            create_dir_for_each_family=False):
    from Routines import SequenceRoutines, FileRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)

    os.remove("tmp.idx")

def add_length_to_fam_file(fam_file, len_file, out_file, close_after_if_file_object=False):
    fam_dict = SynDict()
    fam_dict.read(fam_file, split_values=True, comments_prefix="#")

    len_dict = SynDict()
    len_dict.read(len_file, comments_prefix="#")

    # open for writing (the original opened the output path in read mode);
    # accept either a path or an already opened file-like object
    out_fd = out_file if hasattr(out_file, "write") else open(out_file, "w")

    for family in fam_dict:
        # "NA" is a placeholder for members missing from the length file
        # (the original appended None, which would crash str.join)
        len_list = [len_dict[member] if member in len_dict else "NA"
                    for member in fam_dict[family]]
        out_fd.write("%s\t%s\t%s\n" % (family, ",".join(fam_dict[family]), ",".join(len_list)))

    if close_after_if_file_object:
        out_fd.close()

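# Usage sketch (hypothetical file names; len_file is a two-column
# id-to-length table):
#
#   add_length_to_fam_file("families.fam", "lengths.tab", "families.with_len.tab")
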
def replace_augustus_ids_by_syn(augustus_gff, output_gff, genes_syn_file, transcripts_syn_file,
                                cds_syn_file=None):
    genes_syn_dict = SynDict()
    genes_syn_dict.read(genes_syn_file, comments_prefix="#")
    transcripts_syn_dict = SynDict()
    transcripts_syn_dict.read(transcripts_syn_file, comments_prefix="#")
    cds_syn_dict = SynDict()
    if cds_syn_file:
        cds_syn_dict.read(cds_syn_file, comments_prefix="#")
    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue
                augustus_gene_id = tmp.split(" ")[-1]
                gene_syn_id = genes_syn_dict[augustus_gene_id]
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                tmp = next(in_fd).strip()
                while True:
                    # feature lines of the current gene
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError("Transcript parent id and gene id are not the same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    cds_syn_id = cds_syn_dict[augustus_cds_id] if cds_syn_dict \
                                        else "%s.cds" % transcripts_syn_dict[augustus_cds_id[:-4]]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError("CDS parent id and transcript id are not the same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError("Feature parent id and transcript id are not the same!")
                            edited_str += "\tParent=%s\n" % transcript_syn_id
                        else:
                            # tmp was stripped; restore the newline so records are not glued together
                            edited_str = tmp + "\n"
                        out_fd.write(edited_str)
                        tmp = next(in_fd).strip()
                    # comment lines between features; stop at the gene terminator
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = next(in_fd).strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)

                                                       pep_uniq_description_file)
remove_isoform_versions_str = "sed s/isoform.*// %s > %s" % (pep_uniq_description_file,
                                                             pep_uniq_description_no_isoform_versions)

for exe_string in (get_pep_decription_str, get_uniq_descriptions_str, remove_isoform_versions_str):
    print(exe_string)
    os.system(exe_string)

os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions, header=False, separator="\t",
              allow_repeats_of_key=True, split_values=True, values_separator=",",
              key_index=1, value_index=0, comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms, splited_values=True, values_separator=",")

length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input, format="fasta", out_file=len_file)

descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")

                    help="Remove nucleotide substitutions from output (preserve only AA substitutions)")
parser.add_argument("-c", "--convert_aa_to_single_letter", action="store_true",
                    dest="convert_to_single_letter",
                    help="Convert amino acids to single letters")

args = parser.parse_args()

args.input = make_list_of_path_to_files(args.input)

gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

summary_dict = TwoLvlDict()

for filename in args.input:
    directory, prefix, extension = split_filename(filename)
    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
    if args.suffix_to_remove in name: