예제 #1
0
    def get_codon_alignment_from_files(self,
                                       protein_aln_file,
                                       nucleotide_seq_file,
                                       codon_alignment_file,
                                       cds2protein_accordance_file=None,
                                       alignment_format="fasta",
                                       nucleotide_sequence_format="fasta",
                                       cds_index_file=None,
                                       retain_cds_index=False):
        """Build a codon alignment from a protein alignment file and a CDS file.

        The protein alignment is loaded with ``AlignIO.read`` and the
        nucleotide sequences are indexed with ``SeqIO.index_db``; both are then
        handed to ``self.get_codon_alignment`` together with the output path.

        Args:
            protein_aln_file: path to the protein alignment.
            nucleotide_seq_file: path to the nucleotide (CDS) sequences.
            codon_alignment_file: output path passed to
                ``self.get_codon_alignment``.
            cds2protein_accordance_file: optional two-column table; it is read
                with the columns swapped (``key_index=1``, ``value_index=0``)
                to build the protein-to-CDS mapping.
            alignment_format: format name passed to ``AlignIO.read``.
            nucleotide_sequence_format: format name passed to
                ``SeqIO.index_db``.
            cds_index_file: index file to use; when empty, the temporary
                ``nuc_tmp.idx`` in the working directory is used instead.
            retain_cds_index: keep the temporary index instead of deleting it.
        """
        tmp_index_file = "nuc_tmp.idx"
        index_file = cds_index_file if cds_index_file else tmp_index_file
        remove_tmp_index = (not cds_index_file) and (not retain_cds_index)

        protein_alignment = AlignIO.read(protein_aln_file,
                                         format=alignment_format)
        try:
            nucleotide_seq_dict = SeqIO.index_db(
                index_file,
                nucleotide_seq_file,
                format=nucleotide_sequence_format)

            protein2cds_accordance_dict = None
            if cds2protein_accordance_file:
                protein2cds_accordance_dict = SynDict()
                protein2cds_accordance_dict.read(cds2protein_accordance_file,
                                                 key_index=1,
                                                 value_index=0)

            self.get_codon_alignment(
                protein_alignment,
                nucleotide_seq_dict,
                codon_alignment_file,
                protein2cds_accordance_dict=protein2cds_accordance_dict)
        finally:
            # Clean up even on failure: a stale temporary index would be
            # picked up by the next index_db call on the same file name.
            if remove_tmp_index and os.path.exists(tmp_index_file):
                os.remove(tmp_index_file)
예제 #2
0
    def get_families_from_top_hits(top_hits_file, fam_file):
        """Group top-hit records into families and save them to ``fam_file``.

        The hits table is read with column 1 as the key and column 0 as the
        value, allowing repeated keys and skipping lines starting with "#".
        The resulting dictionary is written with split values and returned.
        """
        families = SynDict()
        families.read(top_hits_file,
                      comments_prefix="#",
                      key_index=1,
                      value_index=0,
                      allow_repeats_of_key=True)
        families.write(fam_file, splited_values=True)

        return families
예제 #3
0
 def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
     """Collect hits from a domtblout table, keyed by column 3.

     The table is read without a header, with the default separator
     (``separator=None``), column 3 as the key and column 0 as the value;
     repeated keys are allowed and "#" lines are skipped. The dictionary is
     written to ``output_file`` only when that argument is truthy, and is
     returned in every case.
     """
     domain_hits = SynDict()
     domain_hits.read(domtblout_file,
                      comments_prefix="#",
                      key_index=3,
                      value_index=0,
                      allow_repeats_of_key=True,
                      separator=None,
                      header=False)
     if not output_file:
         return domain_hits
     domain_hits.write(output_file, splited_values=True)
     return domain_hits
예제 #4
0
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        """Write the sequences of each selected cluster to its own file.

        Args:
            clusters_id_file: optional file with cluster ids to extract; when
                empty, every cluster from ``cluster_file`` is processed.
            cluster_file: cluster table; values are split on ",".
            seq_file: sequence file(s), expanded through
                ``self.make_list_of_path_to_files`` and indexed with
                ``SeqIO.index_db`` using the temporary index ``tmp.idx``.
            output_dir: directory receiving the output files.
            seq_format: sequence format used for both reading and writing.
            out_prefix: if set, used as the output file name for every
                cluster, which forces one directory per cluster.
            create_dir_for_each_cluster: create a subdirectory per cluster.
            skip_cluster_if_no_sequence_for_element: skip clusters for which
                ``self.check_absence_of_cluster_elements`` reports anything.

        Returns:
            Number of clusters skipped because of absent elements.
        """
        from Routines import SequenceRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        self.safe_mkdir(output_dir)
        out_dir = self.check_path(output_dir)
        # A fixed file name would be overwritten by every cluster unless each
        # cluster gets its own directory.
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")

        index_file = "tmp.idx"
        number_of_skipped_clusters = 0
        try:
            protein_dict = SeqIO.index_db(
                index_file,
                self.make_list_of_path_to_files(seq_file),
                format=seq_format)

            for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
                # Test membership before the first lookup: requested ids that
                # are not in the cluster file are ignored instead of failing
                # inside the absence check below.
                if fam_id not in cluster_dict:
                    continue

                if skip_cluster_if_no_sequence_for_element:
                    absent_elements = self.check_absence_of_cluster_elements(
                        cluster_dict[fam_id], protein_dict)
                    if absent_elements:
                        # Single-argument print call behaves the same under
                        # Python 2 and Python 3.
                        print("Skipping cluster %s due to absent element(%s)" % (
                            fam_id, ",".join(absent_elements)))
                        number_of_skipped_clusters += 1
                        continue

                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    self.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)
        finally:
            # Remove the temporary index even on failure so that a stale
            # index is not reused by a later run.
            if os.path.exists(index_file):
                os.remove(index_file)

        print("%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
예제 #5
0
 def extract_hits_from_tbl_output(blast_hits, output_file):
     """Collapse a tab-separated hit table into a dictionary and save it.

     Column 0 is the key and column 1 the value; repeated keys are allowed.
     The result is written to ``output_file`` with tab-separated columns and
     comma-separated values, and is also returned.
     """
     column_separator = "\t"
     hits_by_query = SynDict()
     hits_by_query.read(blast_hits,
                        separator=column_separator,
                        key_index=0,
                        value_index=1,
                        allow_repeats_of_key=True)
     hits_by_query.write(output_file,
                         values_separator=",",
                         separator=column_separator,
                         splited_values=True)
     return hits_by_query
예제 #6
0
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            elements_with_absent_synonyms_file=None,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        """Rename cluster members using a synonym table.

        Every member found in the synonym table is replaced by its synonym;
        members without a synonym keep their original name and are recorded
        per cluster. If ``remove_clusters_with_not_renamed_elements`` is set,
        clusters with at least one such member are left out of the output.

        The renamed clusters are written to ``output_clusters_file``. The
        per-cluster lists of members lacking a synonym are written to
        ``elements_with_absent_synonyms_file`` when it is given, and are
        returned in any case.
        """
        synonyms = SynDict()
        synonyms.read(syn_file,
                      separator=syn_file_column_separator,
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      comments_prefix="#")

        clusters = SynDict()
        clusters.read(clusters_file,
                      comments_prefix="#",
                      split_values=True,
                      values_separator=",")

        renamed_clusters = SynDict()
        missing_by_cluster = SynDict()

        for cluster_id in clusters:
            new_names = []
            missing = []
            for member in clusters[cluster_id]:
                if member in synonyms:
                    new_names.append(synonyms[member])
                else:
                    # Keep the old name and remember that it had no synonym.
                    missing.append(member)
                    new_names.append(member)

            if missing:
                missing_by_cluster[cluster_id] = missing

            if missing and remove_clusters_with_not_renamed_elements:
                continue
            renamed_clusters[cluster_id] = new_names

        renamed_clusters.write(output_clusters_file, splited_values=True)

        if elements_with_absent_synonyms_file:
            missing_by_cluster.write(elements_with_absent_synonyms_file,
                                     splited_values=True)

        return missing_by_cluster
예제 #7
0
 def add_len_to_simple_output(top_hits_simple, len_file, out_file):
     """Append sequence length and covered fraction to a simple hits table.

     For every tab-separated input line the output line contains: column 0,
     the length looked up for column 0 in ``len_file``, column 3, column 1,
     column 2, and ``(column 2 - column 1 + 1) / length`` as a float.
     Columns 1 and 2 are presumably the start and end of the hit -- confirm
     against the producer of ``top_hits_simple``.
     """
     lengths = SynDict()
     lengths.read(len_file)
     row_template = "%s\t%s\t%s\t%s\t%s\t%f\n"
     with open(top_hits_simple, "r") as hits_fd, open(out_file, "w") as result_fd:
         for record in hits_fd:
             fields = record.strip().split("\t")
             seq_id = fields[0]
             seq_len = lengths[seq_id]
             col3, col1, col2 = fields[3], fields[1], fields[2]
             span = float(col2) - float(col1) + 1
             fraction = span / float(seq_len)
             result_fd.write(row_template %
                             (seq_id, seq_len, col3, col1, col2, fraction))
예제 #8
0
 def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
     """Copy a GFF file, replacing first-column names that have a synonym.

     Lines starting with "#" are copied verbatim. For all other lines the
     first tab-separated column is replaced when it is present in the synonym
     table read from ``synonyms_file``; lines without a synonym are copied
     unchanged.
     """
     region_synonyms = SynDict()
     region_synonyms.read(synonyms_file, comments_prefix="#")
     with open(input_gff, "r") as source, open(output_gff, "w") as target:
         for record in source:
             if record[0] != "#":
                 columns = record.split("\t")
                 region = columns[0]
                 if region in region_synonyms:
                     columns[0] = region_synonyms[region]
                     record = "\t".join(columns)
             target.write(record)
예제 #9
0
    def label_cluster_elements_from_file(self,
                                         input_file,
                                         label,
                                         output_file,
                                         separator="@",
                                         label_position="first"):
        """Label the elements of clusters read from a file and save the result.

        The clusters are read from ``input_file`` (split values, "#" lines
        skipped) and passed to ``self.label_cluster_elements`` with ``label``,
        ``separator`` and ``label_position``. The labelled clusters are
        written to ``output_file`` and returned.
        """
        clusters = SynDict()
        clusters.read(input_file, comments_prefix="#", split_values=True)

        labeled_clusters = self.label_cluster_elements(
            clusters,
            label,
            label_position=label_position,
            separator=separator)
        labeled_clusters.write(output_file, splited_values=True)

        return labeled_clusters
예제 #10
0
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w"):
        """Extract clusters containing the listed elements and write them out.

        Clusters are read from ``cluster_file`` and element ids from
        ``element_file`` (lines starting with "#" are skipped in both). The
        selection is delegated to ``self.extract_clusters_by_element_ids``.

        mode: "w" - if elements from the id list are present in a cluster,
                    only those elements are extracted
              "a" - if elements from the id list are present in a cluster,
                    all elements of the cluster are extracted
        """
        wanted_ids = IdList()
        clusters = SynDict()

        clusters.read(cluster_file, comments_prefix="#", split_values=True)
        wanted_ids.read(element_file, comments_prefix="#")

        selected = self.extract_clusters_by_element_ids(clusters,
                                                        wanted_ids,
                                                        mode=mode)
        selected.write(output_file, splited_values=True)
예제 #11
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """Write the proteins of each selected family to its own ``.pep`` file.

        Args:
            families_id_file: optional file with family ids to extract; when
                empty, every family from ``fam_file`` is processed.
            fam_file: family table; values are split on ",".
            pep_file: protein sequence file, indexed with ``SeqIO.index_db``
                using the temporary index ``tmp.idx``.
            output_dir: directory receiving the output files.
            pep_format: sequence format used for both reading and writing.
            out_prefix: if set, used as the output file name for every
                family, which forces one directory per family.
            create_dir_for_each_family: create a subdirectory per family.

        Requested ids that are missing from ``fam_file`` are reported on
        standard output and skipped.
        """
        from Routines import SequenceRoutines, FileRoutines
        fam_id_list = IdList()
        fam_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # A fixed file name would be overwritten by every family unless each
        # family gets its own directory.
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")

        index_file = "tmp.idx"
        try:
            protein_dict = SeqIO.index_db(index_file, pep_file,
                                          format=pep_format)

            for fam_id in fam_id_list if families_id_file else fam_dict:
                if fam_id not in fam_dict:
                    print("%s was not found" % fam_id)
                    continue

                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
        finally:
            # Remove the temporary index even on failure so that a stale
            # index is not reused by a later run.
            if os.path.exists(index_file):
                os.remove(index_file)
예제 #12
0
    def add_length_to_fam_file(fam_file,
                               len_file,
                               out_file,
                               close_after_if_file_object=False):
        """Write families together with the lengths of their members.

        Each output line has three tab-separated columns: the family id, the
        comma-separated members, and the comma-separated lengths of those
        members looked up in ``len_file``. Members without a length entry get
        the placeholder ``None``.

        Args:
            fam_file: family table (split values, "#" lines skipped).
            len_file: table of member lengths ("#" lines skipped).
            out_file: output path, or an already opened writable file-like
                object (anything with a ``write`` method).
            close_after_if_file_object: close ``out_file`` when done if it
                was passed as an open object. A file opened here from a path
                is always closed.
        """
        fam_dict = SynDict()
        fam_dict.read(fam_file, split_values=True, comments_prefix="#")
        len_dict = SynDict()
        len_dict.read(len_file, comments_prefix="#")

        # Duck-type the output instead of relying on the Python-2-only `file`
        # builtin; a path is opened for writing (the old code used "r").
        opened_here = not hasattr(out_file, "write")
        out_fd = open(out_file, "w") if opened_here else out_file

        try:
            for family in fam_dict:
                members = fam_dict[family]
                # Convert to str so missing lengths do not break the join.
                len_list = [
                    str(len_dict[member]) if member in len_dict else "None"
                    for member in members
                ]
                out_fd.write(
                    "%s\t%s\t%s\n" %
                    (family, ",".join(members), ",".join(len_list)))
        finally:
            if opened_here or close_after_if_file_object:
                out_fd.close()
예제 #13
0
    def replace_augustus_ids_by_syn(augustus_gff,
                                    output_gff,
                                    genes_syn_file,
                                    transcripts_syn_file,
                                    cds_syn_file=None):
        """Replace gene/transcript/CDS ids in an AUGUSTUS GFF with synonyms.

        Lines outside of ``# start gene <id>`` ... ``# end gene <id>`` blocks
        are copied unchanged. Inside a block the last column of ``gene``,
        ``transcript``, ``CDS``, ``start_codon`` and ``stop_codon`` records is
        rebuilt from the synonym tables; records of any other feature type and
        comment lines are copied as they are (after stripping surrounding
        whitespace).

        Args:
            augustus_gff: input GFF produced by AUGUSTUS.
            output_gff: path of the rewritten GFF.
            genes_syn_file: synonym table for gene ids.
            transcripts_syn_file: synonym table for transcript ids.
            cds_syn_file: optional synonym table for CDS ids. When the
                resulting table is empty, the CDS id becomes
                ``<transcript synonym>.cds``, where the transcript id is the
                CDS id without its last four characters.

        Raises:
            ValueError: if a Parent attribute does not match the id of the
                enclosing gene/transcript.
            StopIteration: if the input ends inside a gene block.
        """

        genes_syn_dict = SynDict()
        genes_syn_dict.read(genes_syn_file, comments_prefix="#")
        transcripts_syn_dict = SynDict()
        transcripts_syn_dict.read(transcripts_syn_file, comments_prefix="#")
        cds_syn_dict = SynDict()
        if cds_syn_file:
            cds_syn_dict.read(cds_syn_file, comments_prefix="#")
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    # "# start gene" is 12 characters long, so anything
                    # shorter than 13 cannot carry a gene id.
                    if len(tmp) < 13:
                        out_fd.write(line)
                        continue
                    if tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_syn_id = genes_syn_dict[augustus_gene_id]
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    # The rest of the gene block is consumed manually from
                    # the same iterator the outer loop uses.
                    tmp = next(in_fd).strip()
                    while True:
                        # Feature records (non-comment lines) of the block.
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split(
                                            "=")[-1]
                                        transcript_syn_id = transcripts_syn_dict[
                                            augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError(
                                                "Transcript parent id and gene id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        if cds_syn_dict:
                                            cds_syn_id = cds_syn_dict[
                                                augustus_cds_id]
                                        else:
                                            # No CDS synonyms: derive the id
                                            # from the transcript synonym.
                                            cds_syn_id = "%s.cds" % transcripts_syn_dict[
                                                augustus_cds_id[:-4]]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split(
                                            "=")[-1]
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "CDS parent id and transcript id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    cds_syn_id, transcript_syn_id)
                            elif feature_type in ("stop_codon", "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split(
                                            "=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "Feature parent id and transcript id are not same!"
                                            )
                                edited_str += "\tParent=%s\n" % (
                                    transcript_syn_id)
                            else:
                                # Other feature types are copied unchanged;
                                # the newline removed by strip() has to be
                                # restored or the record would be fused with
                                # the following one.
                                edited_str = tmp + "\n"

                            out_fd.write(edited_str)
                            tmp = next(in_fd).strip()
                        # Comment lines inside the block, up to "# end gene".
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = next(in_fd).strip()
                        if "# end gene" in tmp:
                            break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
예제 #14
0
                                                   pep_uniq_description_file)
# Shell command: on every line of the unique-descriptions file delete the text
# from the first "isoform" to the end of the line, writing the result to a new
# file.
# NOTE(review): the file names are interpolated into the shell string without
# quoting -- paths containing spaces or shell metacharacters will break it.
remove_isoform_versions_str = "sed s/isoform.*// %s > %s" % (
    pep_uniq_description_file, pep_uniq_description_no_isoform_versions)

# Echo each prepared shell command and run it, in this order.
for exe_string in get_pep_decription_str, get_uniq_descriptions_str, remove_isoform_versions_str:
    print(exe_string)
    os.system(exe_string)

# Run the awk template (defined outside this fragment) on the
# unique-descriptions file; judging by the names it extracts the ids.
os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

# Load the isoform-stripped descriptions with column 1 as the key and column 0
# as the value, allowing repeated keys, then write the dictionary back out
# with comma-separated values.
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions,
              header=False,
              separator="\t",
              allow_repeats_of_key=True,
              split_values=True,
              values_separator=",",
              key_index=1,
              value_index=0,
              comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms,
               splited_values=True,
               values_separator=",")

# Sequence lengths of the input FASTA; len_file is passed as out_file.
length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input,
                                                         format="fasta",
                                                         out_file=len_file)

# Output handles opened for writing; they are not closed within this part of
# the script.
descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")
예제 #15
0
    help=
    "Remove nucleotide substitutions from output(preserve only AA substitutions)"
)
# -c / --convert_aa_to_single_letter: boolean flag stored as
# args.convert_to_single_letter.
parser.add_argument("-c",
                    "--convert_aa_to_single_letter",
                    action="store_true",
                    dest="convert_to_single_letter",
                    help="Convert aminoacids to single letters")

args = parser.parse_args()

# Replace the raw input argument with the result of
# make_list_of_path_to_files; the loop further down iterates over it as a
# collection of file names.
args.input = make_list_of_path_to_files(args.input)

# Gene aliases are optional: the dictionary stays empty unless an alias file
# was supplied.
gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)
# The literal value "stdout" selects standard output; anything else is treated
# as a path and opened for writing.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

summary_dict = TwoLvlDict()
for filename in args.input:
    directory, prefix, extension = split_filename(filename)

    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
        if args.suffix_to_remove in name: