Example no. 1
    def get_transcript_to_pep_accordance_from_gtf(gtf_file,
                                                  output_file,
                                                  comment_symbol="#"):
        """
        Tested on gtf files from Ensembl release 70
        """
        accordance_dict = SynDict()
        with open(gtf_file, "r") as gtf_fd:
            for line in gtf_fd:
                if line[0] == comment_symbol:
                    continue
                tmp_list = line.strip().split("\t")
                tmp_list = tmp_list[-1].split(";")
                protein_id = None
                transcript_id = None
                #print tmp_list
                for entry in tmp_list:
                    tmp_entry = entry.split()

                    if len(tmp_entry) != 2:
                        continue
                    if tmp_entry[0] == "transcript_id":
                        #print "tttt"
                        transcript_id = tmp_entry[1][1:-1]  # remove quotes
                    elif tmp_entry[0] == "protein_id":
                        #print "ppppp"
                        protein_id = tmp_entry[1][1:-1]

                if (transcript_id is not None) and (protein_id is not None):
                    if transcript_id in accordance_dict:
                        accordance_dict[transcript_id].add(protein_id)
                    else:
                        accordance_dict[transcript_id] = {protein_id}
        accordance_dict.write(output_file, splited_values=True)
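The attribute column parsed above is a semicolon-separated list of key "value" pairs; below is a minimal, self-contained sketch of the same extraction on one hypothetical Ensembl-style attribute string, using a plain dict instead of SynDict:

    attribute_field = 'gene_id "ENSG01"; transcript_id "ENST01"; protein_id "ENSP01";'
    ids = {}
    for entry in attribute_field.split(";"):
        tmp_entry = entry.split()
        if len(tmp_entry) != 2:
            continue
        ids[tmp_entry[0]] = tmp_entry[1][1:-1]  # strip surrounding quotes
    print(ids["transcript_id"], ids["protein_id"])  # ENST01 ENSP01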
Example no. 2
    def get_codon_alignment_from_files(self,
                                       protein_aln_file,
                                       nucleotide_seq_file,
                                       codon_alignment_file,
                                       cds2protein_accordance_file=None,
                                       alignment_format="fasta",
                                       nucleotide_sequence_format="fasta",
                                       cds_index_file=None,
                                       retain_cds_index=False):
        protein_aln_dict = AlignIO.read(protein_aln_file,
                                        format=alignment_format)
        nucleotide_seq_dict = SeqIO.index_db(
            cds_index_file if cds_index_file else "nuc_tmp.idx",
            nucleotide_seq_file,
            format=nucleotide_sequence_format)

        protein2cds_accordance_dict = None
        if cds2protein_accordance_file:
            protein2cds_accordance_dict = SynDict()
            protein2cds_accordance_dict.read(cds2protein_accordance_file,
                                             key_index=1,
                                             value_index=0)

        self.get_codon_alignment(
            protein_aln_dict,
            nucleotide_seq_dict,
            codon_alignment_file,
            protein2cds_accordance_dict=protein2cds_accordance_dict)
        if (not cds_index_file) and (not retain_cds_index):
            os.remove("nuc_tmp.idx")
Example no. 3
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        from Routines import SequenceRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        #print(pep_file)
        self.safe_mkdir(output_dir)
        out_dir = self.check_path(output_dir)
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            self.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:

            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(
                    cluster_dict[fam_id], protein_dict)
                if absent_elements:
                    print "Skipping cluster %s due to absent element(%s)" % (
                        fam_id, ",".join(absent_elements))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    self.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)

        os.remove("tmp.idx")
        print "%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict))

        return number_of_skipped_clusters
Example no. 4
 def add_len_to_simple_output(top_hits_simple, len_file, out_file):
     len_dict = SynDict()
     len_dict.read(len_file)
     with open(top_hits_simple, "r") as in_fd:
         with open(out_file, "w") as out_fd:
             for line in in_fd:
                 tmp_list = line.strip().split("\t")
                 out_fd.write(
                     "%s\t%s\t%s\t%s\t%s\t%f\n" %
                     (tmp_list[0], len_dict[tmp_list[0]], tmp_list[3],
                      tmp_list[1], tmp_list[2],
                      (float(tmp_list[2]) - float(tmp_list[1]) + 1) /
                      float(len_dict[tmp_list[0]])))
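The last output column is the fraction of the query covered by the hit, computed from 1-based inclusive coordinates; a quick check with made-up numbers:

    start, end, query_len = 11, 60, 100  # hypothetical hit on a 100 bp query
    print((float(end) - float(start) + 1) / float(query_len))  # 0.5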
Example no. 5
    def count_unique_positions_per_sequence_from_file(self,
                                                      alignment_file,
                                                      output_prefix,
                                                      format="fasta",
                                                      gap_symbol="-",
                                                      return_mode="absolute",
                                                      verbose=True):

        alignment = AlignIO.read(alignment_file, format=format)
        number_of_sequences = len(alignment)
        alignment_length = len(alignment[0])
        position_presence_matrix = self.get_position_presence_matrix(
            alignment, gap_symbol=gap_symbol, verbose=verbose)
        unique_position_count_dict = SynDict()
        unique_position_count_percent_dict = SynDict()

        for row in range(0, number_of_sequences):
            sequence_id = alignment[row].id
            unique_positions = 0
            for column in range(0, alignment_length):
                if position_presence_matrix[row, column] in (1, -1):
                    unique_positions += 1

            unique_position_count_dict[sequence_id] = unique_positions
            unique_position_count_percent_dict[sequence_id] = 100 * float(
                unique_positions) / (alignment_length -
                                     str(alignment[row].seq).count(gap_symbol))

        unique_position_count_dict.write("%s.absolute_counts" % output_prefix)
        unique_position_count_percent_dict.write("%s.percent_counts" %
                                                 output_prefix)

        return unique_position_count_dict if return_mode == "absolute" else unique_position_count_percent_dict
Example no. 6
 def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
     syn_dict = SynDict()
     syn_dict.read(synonyms_file, comments_prefix="#")
     with open(input_gff, "r") as in_fd:
         with open(output_gff, "w") as out_fd:
             for line in in_fd:
                 if line[0] == "#":
                     out_fd.write(line)
                 else:
                     line_list = line.split("\t")
                     if line_list[0] in syn_dict:
                         line_list[0] = syn_dict[line_list[0]]
                         out_fd.write("\t".join(line_list))
                     else:
                         out_fd.write(line)
Example no. 7
    def combine_count_files(count_file_list,
                            output_file,
                            sample_name_list=None):

        if sample_name_list is not None:
            if len(count_file_list) != len(sample_name_list):
                raise ValueError(
                    "Number of count files and number of sample names differ")

        samples = zip(
            sample_name_list if sample_name_list else count_file_list,
            count_file_list)

        count_table = TwoLvlDict()

        for sample, filename in samples:
            count_table[sample] = SynDict(filename=filename,
                                          header=False,
                                          separator="\t",
                                          allow_repeats_of_key=False,
                                          split_values=False,
                                          values_separator=",",
                                          key_index=0,
                                          value_index=1,
                                          close_after_if_file_object=False,
                                          expression=None,
                                          comments_prefix="__")

        count_table.write(output_file)
Example no. 8
    def merge_clusters(clusters_dict,
                       label_species="False",
                       separator_for_labeling="_",
                       species_label_first=True):

        if species_label_first:
            label_sequence = lambda label, name: "%s%s%s" % (
                label, separator_for_labeling, name)
        else:
            label_sequence = lambda label, name: "%s%s%s" % (
                name, separator_for_labeling, label)
        if label_species:
            expression = label_sequence
        else:
            expression = lambda label, name: name

        merged_clusters = SynDict()
        for species in clusters_dict:
            for cluster in clusters_dict[species]:
                if cluster not in merged_clusters:
                    merged_clusters[cluster] = []
                for sequence_name in clusters_dict[species][cluster]:
                    merged_clusters[cluster].append(
                        expression(species, sequence_name))

        return merged_clusters
Example no. 9
    def split_proteins_per_species(self, dir_with_proteins, output_dir, input_format="fasta", output_format="fasta"):
        #print type(FileRoutines)
        input_files = self.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str) else dir_with_proteins)

        out_dir = self.check_path(output_dir)
        self.safe_mkdir(out_dir)

        protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

        syn_dict = SynDict()

        for protein_id in protein_dict:
            taxa_id = protein_id.split(".")[0]
            # pep_id = ".".join(tmp_list[1:])
            if taxa_id not in syn_dict:
                syn_dict[taxa_id] = []
            syn_dict[taxa_id].append(protein_id)

        def renamed_records_generator(record_dict, taxa_id):
            for record_id in syn_dict[taxa_id]:
                record = deepcopy(record_dict[record_id])
                #print(record)
                record.id = ".".join(record_id.split(".")[1:])
                yield record

        for taxa_id in syn_dict:
            out_file = "%s%s.pep" % (out_dir, taxa_id)
            SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)
Example no. 10
    def replace_label(cluster_dict,
                      syn_dict=None,
                      old_separator="@",
                      old_label_position="first",
                      new_separator="@",
                      new_label_position="first"):
        new_cluster_dict = SynDict()
        for cluster in cluster_dict:
            new_cluster_dict[cluster] = []
            for element in cluster_dict[cluster]:
                tmp = element.split(old_separator)
                if old_label_position == "first":
                    label = tmp[0]
                    element_id = old_separator.join(tmp[1:])
                else:
                    label = tmp[-1]
                    element_id = old_separator.join(tmp[:-1])

                if new_label_position == 'first':
                    new_cluster_dict[cluster].append(
                        "%s%s%s" % (syn_dict[label] if syn_dict else label,
                                    new_separator, element_id))
                else:
                    new_cluster_dict[cluster].append(
                        "%s%s%s" % (element_id, new_separator,
                                    syn_dict[label] if syn_dict else label))

        return new_cluster_dict
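A rough illustration of the relabeling for a single element, assuming a plain synonym dict and hypothetical names (old label in the "first" position, new label moved to "last"):

    syn = {"speciesA": "spA"}
    element = "speciesA@gene1"
    label, element_id = element.split("@", 1)
    print("%s@%s" % (element_id, syn.get(label, label)))  # gene1@spA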
Example no. 11
    def extract_single_copy_clusters_from_files(
            self,
            list_of_cluster_files,
            output_file,
            label_elements=False,
            separator="@",
            label_position="first",
            function_to_convert_filename_to_label=None):
        dict_of_cluster_dicts = OrderedDict()
        for filename in list_of_cluster_files:
            if function_to_convert_filename_to_label:
                label = function_to_convert_filename_to_label(filename)
            else:
                label = self.split_filename(filename)[1]  # use basename as label

            dict_of_cluster_dicts[label] = SynDict()
            dict_of_cluster_dicts[label].read(filename,
                                              split_values=True,
                                              comments_prefix="#")

        sc_clusters_dict = self.extract_single_copy_clusters(
            dict_of_cluster_dicts,
            label_elements=label_elements,
            separator=separator,
            label_position=label_position)

        sc_clusters_dict.write(output_file, splited_values=True)

        return sc_clusters_dict
Example no. 12
    def label_cluster_elements_from_file(self,
                                         input_file,
                                         label,
                                         output_file,
                                         separator="@",
                                         label_position="first"):
        input_dict = SynDict()
        input_dict.read(input_file, split_values=True, comments_prefix="#")

        output_dict = self.label_cluster_elements(
            input_dict,
            label,
            separator=separator,
            label_position=label_position)
        output_dict.write(output_file, splited_values=True)

        return output_dict
Example no. 13
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w"):
        """"
        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """
        cluster_dict = SynDict()
        cluster_dict.read(cluster_file, split_values=True, comments_prefix="#")

        element_id_list = IdList()
        element_id_list.read(element_file, comments_prefix="#")
        extracted_clusters = self.extract_clusters_by_element_ids(
            cluster_dict, element_id_list, mode=mode)
        extracted_clusters.write(output_file, splited_values=True)
Example no. 14
    def prepare_annotation_file_from_transcript_and_cds(
            self,
            transcript_file,
            cds_file,
            correspondence_file,
            output_prefix,
            format="fasta",
            correspondence_key_column=0,
            correspondence_value_column=1,
            verbose=False):
        transcript_dict = self.parse_seq_file(transcript_file,
                                              "parse",
                                              format=format)

        cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      comments_prefix="#",
                                      key_index=correspondence_key_column,
                                      value_index=correspondence_value_column)

        no_corresponding_cds_transcript_list = IdList()
        cds_not_found_transcript_list = IdList()

        annotation_file = "%s.annotation" % output_prefix
        no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
        cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

        with open(annotation_file, "w") as annotation_fd:
            for transcript_id in transcript_dict:
                if transcript_id not in correspondence_dict:
                    no_corresponding_cds_transcript_list.append(transcript_id)
                    if verbose:
                        print(
                            "No cds in correspondence file for transcript %s" %
                            transcript_id)
                    continue
                cds_id = correspondence_dict[transcript_id]
                length = len(cds_dict[cds_id].seq)
                start = transcript_dict[transcript_id].seq.upper().find(
                    cds_dict[cds_id].seq.upper())
                if start == -1:
                    cds_not_found_transcript_list.append(transcript_id)
                    if verbose:
                        print("CDS was not found for transcript %s" %
                              transcript_id)
                    continue
                annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id,
                                                         start + 1, length)

                annotation_fd.write(annotation_string)

        no_corresponding_cds_transcript_list.write(
            no_corresponding_cds_transcript_file)
        cds_not_found_transcript_list.write(cds_not_found_transcript_file)
Example no. 15
    def get_families_from_top_hits(top_hits_file, fam_file):

        hit_dict = SynDict()
        hit_dict.read(top_hits_file, allow_repeats_of_key=True, key_index=1, value_index=0, comments_prefix="#")
        hit_dict.write(fam_file, splited_values=True)

        return hit_dict
Example no. 16
 def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
     hits_dict = SynDict()
     hits_dict.read(domtblout_file, header=False, separator=None, allow_repeats_of_key=True,
                    key_index=3, value_index=0, comments_prefix="#")
     if output_file:
         hits_dict.write(output_file, splited_values=True)
     return hits_dict
Example no. 17
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        from Routines import SequenceRoutines, FileRoutines
        fam_id_list = IdList()
        fam_dict = SynDict()
        #print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Example no. 18
    def replace_label_from_file(self,
                                input_file,
                                output_file,
                                syn_file_or_dict,
                                old_separator="@",
                                old_label_position="first",
                                new_separator="@",
                                new_label_position="first"):

        syn_dict = SynDict(
            filename=syn_file_or_dict, split_values=False) if isinstance(
                syn_file_or_dict, str) else SynDict(syn_file_or_dict)
        cluster_dict = SynDict(filename=input_file, split_values=True)
        new_cluster_dict = self.replace_label(
            cluster_dict,
            syn_dict=syn_dict,
            old_separator=old_separator,
            old_label_position=old_label_position,
            new_separator=new_separator,
            new_label_position=new_label_position)

        new_cluster_dict.write(output_file, splited_values=True)

        return new_cluster_dict
Example no. 19
    def label_cluster_elements(cluster_dict,
                               label,
                               separator="@",
                               label_position="first"):
        labeled_cluster_dict = SynDict()
        if label_position == "first":
            label_function = lambda s: "%s%s%s" % (label, separator, s)
        elif label_position == "last":
            label_function = lambda s: "%s%s%s" % (s, separator, label)

        for cluster in cluster_dict:
            labeled_cluster_dict[cluster] = []
            for element in cluster_dict[cluster]:
                labeled_cluster_dict[cluster].append(label_function(element))

        return labeled_cluster_dict
Example no. 20
    def replace_column_value_by_syn(input_file,
                                    syn_file,
                                    out_file,
                                    column=0,
                                    comment_prefix=None,
                                    separator="\t",
                                    syn_header=False,
                                    syn_separator="\t",
                                    syn_key_index=0,
                                    syn_value_index=1,
                                    syn_comment_prefix=None):
        syn_dict = SynDict(filename=syn_file,
                           header=syn_header,
                           separator=syn_separator,
                           key_index=syn_key_index,
                           value_index=syn_value_index,
                           comments_prefix=syn_comment_prefix)
        if comment_prefix:
            comment_prefix_len = len(comment_prefix)
        line_number = 0
        replaced = 0
        not_replaced = 0
        with open(input_file, "r") as in_fd:
            with open(out_file, "w") as out_fd:
                for line in in_fd:
                    line_number += 1
                    if comment_prefix:
                        if line[0:comment_prefix_len] == comment_prefix:
                            out_fd.write(line)
                            continue
                    line_list = line.strip("\n").split(separator)
                    if len(line_list) < column + 1:
                        sys.stderr.write(
                            "WARNING!!! Line %i doesn't have column %i\n" %
                            (line_number, column))
                        out_fd.write(line)
                        continue
                    if line_list[column] in syn_dict:
                        replaced += 1
                        line_list[column] = syn_dict[line_list[column]]
                    else:
                        not_replaced += 1

                    out_fd.write(separator.join(line_list))
                    out_fd.write("\n")

        sys.stderr.write("Replaced: %i\nNot replaced: %i\n" %
                         (replaced, not_replaced))
Example no. 21
 def extract_hits_from_tbl_output(blast_hits, output_file):
     hits = SynDict()
     hits.read(blast_hits,
               allow_repeats_of_key=True,
               key_index=0,
               value_index=1,
               separator="\t")
     hits.write(output_file,
                splited_values=True,
                separator="\t",
                values_separator=",")
     return hits
Example no. 22
 def read_cluster_files_from_dir(self, dir_with_cluster_files):
     cluster_files_list = sorted(os.listdir(dir_with_cluster_files))
     clusters_dict = OrderedDict()
     for filename in cluster_files_list:
         filepath = "%s%s" % (self.check_path(dir_with_cluster_files),
                              filename)
         filename_list = self.split_filename(filepath)
         clusters_dict[filename_list[1]] = SynDict()
         clusters_dict[filename_list[1]].read(filepath,
                                              header=False,
                                              separator="\t",
                                              allow_repeats_of_key=False,
                                              split_values=True,
                                              values_separator=",",
                                              key_index=0,
                                              value_index=1,
                                              comments_prefix="#")
     return clusters_dict
Example no. 23
 def get_sequence_names(clusters_dict,
                        write_ids=False,
                        out_prefix=None,
                        white_list_ids=None):
     sequence_names_dict = SynDict()
     for species in clusters_dict:
         sequence_names_dict[species] = IdSet()
     for species in clusters_dict:
         for cluster_id in clusters_dict[species]:
             if white_list_ids:
                 if cluster_id not in white_list_ids:
                     continue
             sequence_names_dict[species] = sequence_names_dict[
                 species] | IdSet(clusters_dict[species][cluster_id])
     if write_ids:
         for species in clusters_dict:
             out_file = "%s_%s.ids" % (
                 out_prefix, species) if out_prefix else "%s.ids" % species
             sequence_names_dict[species].write(out_file)
     return sequence_names_dict
Example no. 24
    def get_species_from_eggnog_tsv(self, eggnog_tsv, output_prefix, email=None):

        cluster_dict = SynDict(filename=eggnog_tsv, key_index=1, value_index=5, split_values=True)

        species_ids = self.extract_labels_from_cluster_elements(cluster_dict, separator=".", label_position="first")
        print("Input species ids: %s" % " ".join(species_ids))

        if not email:
            species = species_ids
        else:
            species = NCBIRoutines.get_taxonomy(species_ids, "%s.species.taxonomy" % output_prefix,
                                                email, input_type="id")

        species.write("%s.species" % output_prefix, splited_values=True)

        for species_id in species:
            for i in range(0, len(species[species_id])):
                species[species_id][i] = species[species_id][i].lower().replace(" ", "_")

        species.write("%s.replaced_spaces.species" % output_prefix, splited_values=True)
Example no. 25
    def add_length_to_fam_file(fam_file,
                               len_file,
                               out_file,
                               close_after_if_file_object=False):
        fam_dict = SynDict()
        fam_dict.read(fam_file, split_values=True, comments_prefix="#")
        len_dict = SynDict()
        len_dict.read(len_file, comments_prefix="#")

        out_fd = out_file if isinstance(out_file, file) else open(
            out_file, "w")

        for family in fam_dict:
            len_list = []
            for member in fam_dict[family]:
                len_list.append("NA" if member not in
                                len_dict else len_dict[member])

            out_fd.write(
                "%s\t%s\t%s\n" %
                (family, ",".join(fam_dict[family]), ",".join(len_list)))

        if close_after_if_file_object:
            out_fd.close()
Example no. 26
    def extract_single_copy_clusters(dict_of_cluster_dicts,
                                     label_elements=False,
                                     separator="@",
                                     label_position="first"):

        if label_position == "first":
            label_function = lambda s, label: "%s%s%s" % (label, separator, s)
        elif label_position == "last":
            label_function = lambda s, label: "%s%s%s" % (s, separator, label)

        sc_clusters_dict = SynDict()

        clusters_set = set()
        for group in dict_of_cluster_dicts:
            clusters_set = clusters_set | set(
                dict_of_cluster_dicts[group].keys())

        for cluster in clusters_set:
            for group in dict_of_cluster_dicts:
                if cluster not in dict_of_cluster_dicts[group]:
                    break
                if len(dict_of_cluster_dicts[group][cluster]) > 1:
                    break
            else:
                sc_clusters_dict[cluster] = []
                for group in dict_of_cluster_dicts:
                    if label_elements:
                        sc_clusters_dict[cluster].append(
                            label_function(
                                dict_of_cluster_dicts[group][cluster][0],
                                group))
                    else:
                        sc_clusters_dict[cluster].append(
                            dict_of_cluster_dicts[group][cluster][0])

        return sc_clusters_dict
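The inner loop above relies on Python's for/else: the else branch runs only if the loop finishes without a break, i.e. only when the cluster occurs in every group and is single-copy in each of them. A minimal sketch of the idiom with made-up data:

    groups = {"g1": ["a"], "g2": ["b"]}
    for group in groups:
        if len(groups[group]) > 1:
            break
    else:
        print("cluster is single copy in every group")  # reached only if no break occurred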
Example no. 27
    def extract_clusters_by_element_ids(cluster_dict,
                                        element_id_list,
                                        mode="w"):
        """"
        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """

        extracted_clusters = SynDict()
        for cluster in cluster_dict:
            extracted_elements = []
            if mode == "w":
                for element in cluster_dict[cluster]:
                    if element in element_id_list:
                        extracted_elements.append(element)
                if extracted_elements:
                    extracted_clusters[cluster] = extracted_elements
            elif mode == "a":
                for element in cluster_dict[cluster]:
                    if element in element_id_list:
                        extracted_clusters[cluster] = cluster_dict[cluster]
                        break

        return extracted_clusters
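To make the two modes concrete, here is a small sketch with plain dicts and made-up cluster data (assuming SynDict behaves like an ordinary dict for lookup and iteration):

    clusters = {"fam1": ["a", "b", "c"], "fam2": ["d", "e"]}
    wanted = {"a", "c", "x"}
    # mode "w": keep only the listed elements, drop clusters left empty
    only_wanted = {c: [e for e in clusters[c] if e in wanted] for c in clusters}
    print({c: v for c, v in only_wanted.items() if v})  # {'fam1': ['a', 'c']}
    # mode "a": keep whole clusters containing at least one listed element
    print({c: clusters[c] for c in clusters if any(e in wanted for e in clusters[c])})  # {'fam1': ['a', 'b', 'c']}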
Example no. 28
    def replace_augustus_ids_by_syn(augustus_gff,
                                    output_gff,
                                    genes_syn_file,
                                    transcripts_syn_file,
                                    cds_syn_file=None):

        genes_syn_dict = SynDict()
        genes_syn_dict.read(genes_syn_file, comments_prefix="#")
        transcripts_syn_dict = SynDict()
        transcripts_syn_dict.read(transcripts_syn_file, comments_prefix="#")
        cds_syn_dict = SynDict()
        if cds_syn_file:
            cds_syn_dict.read(cds_syn_file, comments_prefix="#")
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    if len(tmp) < 13:
                        out_fd.write(line)
                        continue
                    if tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_syn_id = genes_syn_dict[augustus_gene_id]
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    tmp = next(in_fd).strip()
                    while True:
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split(
                                            "=")[-1]
                                        transcript_syn_id = transcripts_syn_dict[
                                            augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError(
                                                "Transcript parent id and gene id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        cds_syn_id = cds_syn_dict[
                                            augustus_cds_id] if cds_syn_dict else "%s.cds" % transcripts_syn_dict[
                                                augustus_cds_id[:-4]]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split(
                                            "=")[-1]
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "CDS parent id and transcript id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    cds_syn_id, transcript_syn_id)
                            elif (feature_type
                                  == "stop_codon") or (feature_type
                                                       == "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split(
                                            "=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "Feature parent id and transcript id are not same!"
                                            )
                                edited_str += "\tParent=%s\n" % (
                                    transcript_syn_id)
                            else:
                                edited_str = tmp + "\n"

                            out_fd.write(edited_str)
                            tmp = next(in_fd).strip()
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = next(in_fd).strip()
                        if "# end gene" in tmp:
                            break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
Example no. 29
    def replace_augustus_ids(augustus_gff,
                             output_prefix,
                             species_prefix=None,
                             number_of_digits_in_id=8):

        output_gff = "%s.renamed.gff" % output_prefix
        genes_syn_file = "%s.gene.syn" % output_prefix
        transcripts_syn_file = "%s.transcript.syn" % output_prefix
        cds_syn_file = "%s.cds.syn" % output_prefix
        genes_syn_dict = SynDict()
        transcripts_syn_dict = SynDict()
        cds_syn_dict = SynDict()
        gene_counter = 0
        gene_id_template = "%sG%%0%ii" % (species_prefix,
                                          number_of_digits_in_id)
        transcripts_counter = 0
        transcript_id_template = "%sT%%0%ii" % (species_prefix,
                                                number_of_digits_in_id)
        cds_counter = 0
        cds_id_template = "%sC%%0%ii" % (species_prefix,
                                         number_of_digits_in_id)
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    if len(tmp) < 13:
                        out_fd.write(line)
                        continue
                    if tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_counter += 1

                    gene_syn_id = gene_id_template % gene_counter
                    genes_syn_dict[augustus_gene_id] = gene_syn_id
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    tmp = next(in_fd).strip()
                    while True:
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_id not in transcripts_syn_dict:
                                            transcripts_counter += 1
                                            transcripts_syn_dict[
                                                augustus_transcript_id] = transcript_id_template % transcripts_counter
                                        transcript_syn_id = transcripts_syn_dict[
                                            augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError(
                                                "Transcript parent id and gene id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        if augustus_cds_id not in cds_syn_dict:
                                            cds_counter += 1
                                            cds_syn_dict[
                                                augustus_cds_id] = cds_id_template % cds_counter
                                        cds_syn_id = cds_syn_dict[
                                            augustus_cds_id]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split(
                                            "=")[-1]
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "CDS parent id and transcript id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    cds_syn_id, transcript_syn_id)
                            elif (feature_type
                                  == "stop_codon") or (feature_type
                                                       == "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split(
                                            "=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "Feature parent id and transcript id are not same!"
                                            )
                                edited_str += "\tParent=%s\n" % transcript_syn_id
                            else:
                                edited_str = tmp + "\n"

                            out_fd.write(edited_str)
                            tmp = next(in_fd).strip()
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = next(in_fd).strip()
                        if "# end gene" in tmp:
                            break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
        genes_syn_dict.write(genes_syn_file)
        transcripts_syn_dict.write(transcripts_syn_file)
        cds_syn_dict.write(cds_syn_file)
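A quick check of how the nested %-formatting builds the ID templates (hypothetical species prefix):

    species_prefix, number_of_digits_in_id = "PAN", 8
    gene_id_template = "%sG%%0%ii" % (species_prefix, number_of_digits_in_id)
    print(gene_id_template)      # PANG%08i
    print(gene_id_template % 1)  # PANG00000001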
Example no. 30
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            elements_with_absent_synonyms_file=None,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        syn_dict = SynDict()
        syn_dict.read(syn_file,
                      comments_prefix="#",
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      separator=syn_file_column_separator)

        clusters_dict = SynDict()
        clusters_dict.read(clusters_file,
                           split_values=True,
                           values_separator=",",
                           comments_prefix="#")

        output_clusters_dict = SynDict()

        absent_elements_dict = SynDict()

        for cluster in clusters_dict:
            renamed_element_list = []
            all_elements_were_renamed_flag = True
            for element in clusters_dict[cluster]:
                if element in syn_dict:
                    renamed_element_list.append(syn_dict[element])
                else:
                    if cluster not in absent_elements_dict:
                        absent_elements_dict[cluster] = [element]
                    else:
                        absent_elements_dict[cluster].append(element)
                    all_elements_were_renamed_flag = False
                    renamed_element_list.append(element)

            if (not remove_clusters_with_not_renamed_elements) or (
                    remove_clusters_with_not_renamed_elements
                    and all_elements_were_renamed_flag):
                output_clusters_dict[cluster] = renamed_element_list

        output_clusters_dict.write(output_clusters_file, splited_values=True)

        if elements_with_absent_synonyms_file:
            absent_elements_dict.write(elements_with_absent_synonyms_file,
                                       splited_values=True)

        return absent_elements_dict