def correct_regions_from_gff(
            self,
            reference,
            variants_vcf,
            gff_file,
            output_prefix=None,
            feature_type_list=["CDS"],
            unification_key="Parent",
            #raw_seq_per_line=False,
            vcf_with_masking=None,
            override_vcf_by_mask=None,
            use_ambiguous_nuccleotides=None):
        """
        Apply variants from a VCF to the features selected from a GFF and
        write the corrected, strand-oriented region sequences as FASTA.

        Output files (all named from output_prefix):
          <prefix>.fasta                 corrected region sequences
          <prefix>.frameshifts.region.ids  regions whose corrected fragment
                                           length shifted by a non-multiple
                                           of 3 (likely frameshifts)
        plus the intermediate files produced by get_feature_dict() and
        correct_reference().

        @param reference: reference sequence file to correct
        @param variants_vcf: VCF with variants to apply
        @param gff_file: GFF with the features to extract
        @param output_prefix: prefix for all output files
        @param feature_type_list: GFF feature types to use (default ["CDS"])
        @param unification_key: GFF attribute used to group fragments of the
               same region (default "Parent")
        @param vcf_with_masking: optional VCF of positions to mask
        @param override_vcf_by_mask: whether masking overrides variants
        @param use_ambiguous_nuccleotides: encode heterozygous sites as
               ambiguous nucleotides (passed through to correct_reference)
        """
        # Group GFF fragments by the unification key; this call also writes
        # "<prefix>.coordinates_only.list", which is reused below as the
        # interval list for correct_reference().
        feature_dict = AnnotationsRoutines.get_feature_dict(
            gff_file,
            output_prefix=output_prefix,
            feature_type_list=feature_type_list,
            unification_key=unification_key)
        region_file = "%s.coordinates_only.list" % output_prefix

        raw_regions = "%s.raw.seq" % output_prefix
        final_regions = "%s.fasta" % output_prefix

        regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

        # Write one corrected sequence per interval, one sequence per line,
        # in the same order as the intervals in region_file.
        self.correct_reference(
            reference,
            raw_regions,
            variants_vcf,
            raw_seq_per_line=True,
            vcf_with_masking=vcf_with_masking,
            override_vcf_by_mask=override_vcf_by_mask,
            use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
            interval_list=region_file)

        region_with_frameshift = SynDict()

        def new_regions_generator():
            # Relies on raw_regions holding exactly one line per feature
            # fragment, in feature_dict iteration order.
            with open(raw_regions, "r") as in_fd:
                for region_id in feature_dict:
                    seq = ""
                    for i in range(0, len(feature_dict[region_id])):
                        seq_fragment = in_fd.readline().strip()
                        # If the corrected fragment length differs from the
                        # original span (end - start + 1) by a non-multiple
                        # of 3, an indel likely introduced a frameshift.
                        if ((int(feature_dict[region_id][i][2]) -
                             int(feature_dict[region_id][i][1]) + 1) -
                                len(seq_fragment)) % 3 != 0:
                            if region_id not in region_with_frameshift:
                                region_with_frameshift[region_id] = [i]
                            else:
                                region_with_frameshift[region_id].append(i)
                        seq += seq_fragment
                    # Strand of the first fragment decides the orientation of
                    # the whole concatenated region.
                    yield SeqRecord(
                        seq=Seq(seq) if feature_dict[region_id][0][3] == "+"
                        else Seq(seq).reverse_complement(),
                        id=region_id,
                        description="")

        SeqIO.write(new_regions_generator(), final_regions, format="fasta")
        region_with_frameshift.write(regions_with_frameshift_file,
                                     splited_values=True)
Ejemplo n.º 2
0
    def get_transcript_to_pep_accordance_from_gtf(gtf_file,
                                                  output_file,
                                                  comment_symbol="#"):
        """
        Extract transcript_id -> protein_id correspondence from a GTF file
        and write it to output_file (one transcript per line, values split).

        Tested on GTF files from Ensembl release 70.

        @param gtf_file: input GTF file
        @param output_file: file to write the correspondence table to
        @param comment_symbol: first character marking comment lines
        """
        accordance_dict = SynDict()
        with open(gtf_file, "r") as gtf_fd:
            for gtf_line in gtf_fd:
                if gtf_line[0] == comment_symbol:
                    continue
                # The attribute column is the last tab-separated field;
                # entries inside it are ";"-separated 'key "value"' pairs.
                attributes = gtf_line.strip().split("\t")[-1].split(";")
                pep_id = None
                rna_id = None
                for attribute in attributes:
                    pair = attribute.split()
                    if len(pair) != 2:
                        continue
                    key, value = pair
                    if key == "transcript_id":
                        rna_id = value[1:-1]  # strip surrounding quotes
                    elif key == "protein_id":
                        pep_id = value[1:-1]

                if (rna_id is not None) and (pep_id is not None):
                    if rna_id in accordance_dict:
                        accordance_dict[rna_id].add(pep_id)
                    else:
                        accordance_dict[rna_id] = {pep_id}
        accordance_dict.write(output_file, splited_values=True)
Ejemplo n.º 3
0
    def count_per_scaffold_feature_number(gff_file,
                                          out_file=None,
                                          feature_type_list=None):
        """
        Count GFF features per scaffold.

        @param gff_file: input GFF file
        @param out_file: optional file to write the counts to
        @param feature_type_list: feature types (GFF column 3) to count;
               None or an empty list means "count every feature type".
               NOTE: default changed from a shared mutable list ([]) to
               None; both are falsy, so caller-visible behavior is unchanged.
        @return: SynDict mapping scaffold id -> feature count
        """
        feature_count_dict = SynDict()

        if feature_type_list:

            def check_feature_type(feature_type):
                return feature_type in feature_type_list
        else:

            def check_feature_type(feature_type):
                return True

        with open(gff_file, "r") as gff_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                line_list = line.split("\t")
                if check_feature_type(line_list[2]):
                    if line_list[0] in feature_count_dict:
                        feature_count_dict[line_list[0]] += 1
                    else:
                        feature_count_dict[line_list[0]] = 1

        if out_file:
            feature_count_dict.write(out_file)

        return feature_count_dict
Ejemplo n.º 4
0
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            elements_with_absent_synonyms_file=None,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t',
            keep_only_unique_elements=False):
        """
        Rename cluster members using a synonym table.

        Members without a synonym keep their original name; such members are
        collected per cluster and, if requested, their clusters are dropped
        from the output entirely.

        @param clusters_file: input clusters file (values comma-separated)
        @param syn_file: two-column synonym table (old name -> new name)
        @param output_clusters_file: file for the renamed clusters
        @param remove_clusters_with_not_renamed_elements: drop clusters that
               contain at least one member without a synonym
        @param elements_with_absent_synonyms_file: optional file listing the
               members lacking synonyms, grouped by cluster
        @param syn_file_key_column_index: key column in syn_file
        @param syn_file_value_column_index: value column in syn_file
        @param syn_file_column_separator: column separator of syn_file
        @param keep_only_unique_elements: deduplicate renamed members
        @return: SynDict of members with absent synonyms, per cluster
        """
        syn_dict = SynDict()
        syn_dict.read(syn_file,
                      comments_prefix="#",
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      separator=syn_file_column_separator)

        clusters_dict = SynDict()
        clusters_dict.read(clusters_file,
                           split_values=True,
                           values_separator=",",
                           comments_prefix="#")

        output_clusters_dict = SynDict()
        absent_elements_dict = SynDict()

        for cluster_id in clusters_dict:
            renamed_members = []
            fully_renamed = True
            for member in clusters_dict[cluster_id]:
                if member in syn_dict:
                    renamed_members.append(syn_dict[member])
                    continue
                # No synonym: keep the original name and remember the miss.
                fully_renamed = False
                renamed_members.append(member)
                if cluster_id in absent_elements_dict:
                    absent_elements_dict[cluster_id].append(member)
                else:
                    absent_elements_dict[cluster_id] = [member]

            # Equivalent to: (not remove) or (remove and fully_renamed).
            if fully_renamed or not remove_clusters_with_not_renamed_elements:
                if keep_only_unique_elements:
                    output_clusters_dict[cluster_id] = set(renamed_members)
                else:
                    output_clusters_dict[cluster_id] = renamed_members

        output_clusters_dict.write(output_clusters_file, splited_values=True)

        if elements_with_absent_synonyms_file:
            absent_elements_dict.write(elements_with_absent_synonyms_file,
                                       splited_values=True)

        return absent_elements_dict
Ejemplo n.º 5
0
    def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
        """
        Write a <repeat ID> -> <monomer period> table from a TRF-produced GFF.

        @param trf_gff: GFF produced by Tandem Repeats Finder (records carry
               "ID" and "Period" attributes)
        @param len_file: output two-column table
        """
        len_dict = SynDict()

        with open(trf_gff, "r") as gff_fd:
            for gff_line in gff_fd:
                if gff_line[0] == "#":
                    continue
                attributes = AnnotationsRoutines.get_description_dict_from_gff_string(gff_line)
                len_dict[attributes["ID"]] = attributes["Period"]

        len_dict.write(len_file)
Ejemplo n.º 6
0
    def get_families_from_top_hits(top_hits_file, fam_file):
        """
        Build family clusters from a two-column top-hits table and write them
        to fam_file.

        Column 1 (hit/family id) becomes the key and column 0 (query id) the
        value; repeated keys accumulate every query hitting the same family.

        @param top_hits_file: input table (query, hit)
        @param fam_file: output families file
        @return: SynDict mapping family id -> list of query ids
        """
        hit_dict = SynDict()
        hit_dict.read(top_hits_file,
                      allow_repeats_of_key=True,
                      key_index=1,
                      value_index=0,
                      comments_prefix="#")
        hit_dict.write(fam_file, splited_values=True)

        return hit_dict
Ejemplo n.º 7
0
 def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
     """
     Group HMMER domtblout entries: column 3 becomes the key and column 0
     the accumulated values (in HMMER's domtblout layout these are the
     query name and the target name respectively -- confirm for your HMMER
     version).  separator=None splits on any whitespace run, matching the
     space-aligned domtblout format.

     @param domtblout_file: HMMER --domtblout output file
     @param output_file: optional file to write the grouped table to
     @return: SynDict mapping key column -> list of value-column entries
     """
     hits_dict = SynDict()
     hits_dict.read(domtblout_file,
                    header=False,
                    separator=None,
                    allow_repeats_of_key=True,
                    key_index=3,
                    value_index=0,
                    comments_prefix="#")
     if output_file:
         hits_dict.write(output_file, splited_values=True)
     return hits_dict
Ejemplo n.º 8
0
 def syn2fam(syn_file,
             fam_file,
             key_column=0,
             value_column=1,
             separator="\t"):
     """
     Convert a synonym table to a families file by grouping all values that
     share the same key.

     @param syn_file: input table
     @param fam_file: output families file
     @param key_column: column index used as family id
     @param value_column: column index used as member id
     @param separator: column separator of syn_file
     """
     syn_dict = SynDict(filename=syn_file,
                        allow_repeats_of_key=True,
                        key_index=key_column,
                        value_index=value_column,
                        separator=separator,
                        split_values=True)
     syn_dict.write(fam_file, splited_values=True)
Ejemplo n.º 9
0
    def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file,
                                                      output_file):
        """
        Extract per-protein GO terms from an eggNOG-mapper annotation file.

        Column 0 is read as the protein id and column 5 as the
        comma-separated GO term list (matches the classic emapper layout --
        verify for your emapper version).  Writes a headered table
        (protein_id, GO_terms) and returns the dictionary.

        @param emapper_annotation_file: eggNOG-mapper .annotations file
        @param output_file: output table
        @return: SynDict mapping protein id -> list of GO terms
        """
        GO_terms_dict = SynDict(filename=emapper_annotation_file,
                                key_index=0,
                                value_index=5,
                                split_values=True,
                                values_separator=",",
                                comments_prefix="#",
                                separator="\t")
        GO_terms_dict.header = "#protein_id\tGO_terms"
        GO_terms_dict.write(output_file, header=True, splited_values=True)

        return GO_terms_dict
Ejemplo n.º 10
0
    def extract_predicted_gene_names_from_emapper_annotation_file(
            emapper_annotation_file, output_file):
        """
        Extract predicted gene names from an eggNOG-mapper annotation file.

        Column 0 is read as the protein id and column 4 as the
        comma-separated predicted gene name(s) (matches the classic emapper
        layout -- verify for your emapper version).  Writes a headered table
        (protein_id, predicted_gene_name) and returns the dictionary.

        @param emapper_annotation_file: eggNOG-mapper .annotations file
        @param output_file: output table
        @return: SynDict mapping protein id -> list of predicted gene names
        """
        extract_predicted_gene_names_dict = SynDict(
            filename=emapper_annotation_file,
            key_index=0,
            value_index=4,
            split_values=True,
            values_separator=",",
            comments_prefix="#",
            separator="\t")
        extract_predicted_gene_names_dict.header = "#protein_id\tpredicted_gene_name"
        extract_predicted_gene_names_dict.write(output_file,
                                                header=True,
                                                splited_values=True)

        return extract_predicted_gene_names_dict
Ejemplo n.º 11
0
    def convert_emapper_annotation_file_to_fam(emapper_annotation_file,
                                               output_fam,
                                               eggnogdb_prefix=None,
                                               species_name=None,
                                               label_separator="@",
                                               diamond_mode=False,
                                               database=None):
        """
        Convert an eggNOG-mapper annotation file to a families (.fam) file,
        grouping gene ids by their eggNOG family.

        @param emapper_annotation_file: eggNOG-mapper .annotations file
        @param output_fam: output families file
        @param eggnogdb_prefix: optional prefix prepended to every family id
        @param species_name: if set, gene ids are written as
               "<species_name><label_separator><gene_id>"
        @param label_separator: separator between species label and gene id
        @param diamond_mode: parse diamond-mode output (family taken from
               column 9, which holds comma-separated "fam@db" entries)
        @param database: eggNOG database name (e.g. veNOG); required in
               diamond mode
        @raises ValueError: if diamond_mode is set without a database
        """
        fam_dict = SynDict()

        if diamond_mode and (database is not None):

            def extract_fam_from_line(line_list):
                # Column 9 holds comma-separated "fam@db" entries; invert
                # each pair to map db -> fam and pick the requested database.
                db_dict = dict(
                    map(lambda s: s.split("@")[::-1], line_list[9].split(",")))
                return db_dict[database] if database in db_dict else "unknown"
        elif diamond_mode:
            raise ValueError(
                "ERROR!!! Database name (veNOG or other) is required in diamond mode!"
            )
        else:

            def extract_fam_from_line(line_list):
                # Column 10 looks like "FAM|...": take the family part.
                return line_list[10].split("|")[0]

        with open(emapper_annotation_file, "r") as annotations_fd:
            for line in annotations_fd:
                if line[0] == "#":
                    continue
                line_list = line.split("\t")

                fam_id = extract_fam_from_line(line_list)
                # Idiom fix: was "if not (eggnogdb_prefix is None)".
                if eggnogdb_prefix is not None:
                    fam_id = eggnogdb_prefix + fam_id

                gene_id = "%s%s%s" % (
                    species_name, label_separator,
                    line_list[0]) if species_name else line_list[0]

                if fam_id in fam_dict:
                    fam_dict[fam_id].append(gene_id)
                else:
                    fam_dict[fam_id] = [gene_id]

        fam_dict.write(filename=output_fam, splited_values=True)
Ejemplo n.º 12
0
    def get_feature_dict(self,
                         input_gff,
                         output_prefix=None,
                         feature_type_list=["CDS"],
                         unification_key="Parent"):
        """
        Collect coordinates of selected GFF features, grouped by a GFF
        attribute (e.g. all CDS fragments of one mRNA via "Parent").

        @param input_gff: input GFF file
        @param output_prefix: if set, four companion files are written:
               <prefix>.tab / <prefix>.coordinates_only.tab (tab tables) and
               <prefix>.list / <prefix>.coordinates_only.list (GATK-style
               interval lists), each with and without region ids
        @param feature_type_list: GFF feature types to keep
        @param unification_key: attribute used to group fragments
        @return: SynDict mapping group id -> list of
                 [scaffold, start, end, strand] fragments
        """
        feature_dict = SynDict()
        for line_list in self.file_line_as_list_generator(input_gff,
                                                          comments_prefix="#",
                                                          separator="\t"):
            annotation_dict = self.parse_gff_annotation_string_to_dict(
                line_list[self.GFF_ATTRIBUTE_COLUMN])

            # Skip feature types we are not interested in.
            if line_list[self.GFF_FEATURETYPE_COLUMN] not in feature_type_list:
                continue

            # Features lacking the grouping attribute cannot be unified.
            if unification_key not in annotation_dict:
                continue

            if annotation_dict[unification_key][0] not in feature_dict:
                feature_dict[annotation_dict[unification_key][0]] = []

            # Store one [scaffold, start, end, strand] entry per fragment.
            feature_dict[annotation_dict[unification_key][0]].append([
                line_list[self.GFF_SCAFFOLD_COLUMN],
                line_list[self.GFF_START_COLUMN],
                line_list[self.GFF_END_COLUMN],
                line_list[self.GFF_STRAND_COLUMN]
            ])

        if output_prefix:
            feature_dict.write(
                "%s.tab" % output_prefix,
                value_expression=self.feature_list_entry_to_tab_str,
                line_per_value=True)
            feature_dict.write(
                "%s.coordinates_only.tab" % output_prefix,
                value_expression=self.feature_list_entry_to_tab_str,
                line_per_value=True,
                values_only=True)

            feature_dict.write(
                "%s.list" % output_prefix,
                value_expression=self.feature_list_entry_to_gatk_interval_str,
                line_per_value=True)
            feature_dict.write(
                "%s.coordinates_only.list" % output_prefix,
                value_expression=self.feature_list_entry_to_gatk_interval_str,
                line_per_value=True,
                values_only=True)

        return feature_dict
Ejemplo n.º 13
0
    def count_miRNA_reads(self, alignment_file, gff_file, output_prefix, annotation_file_type="GTF",
                          min_read_fraction_overlap=1.0, feature_type_to_use=None, attribute_type_to_use=None,
                          sample_name=None, stranded=1):
        """
        Count miRNA reads twice -- without and with multimapped reads -- and
        derive an adjusted per-feature count in which multimapped reads are
        spread evenly over the features they hit.

        Writes three files: <prefix>.no_multimapped_reads.count,
        <prefix>.with_multimapped_reads.count and
        <prefix>.all_adjusted_reads.count.
        """
        no_multimapped_read_counts = "%s.no_multimapped_reads.count" % output_prefix
        with_multimapped_read_counts = "%s.with_multimapped_reads.count" % output_prefix
        all_adjusted_read_counts = "%s.all_adjusted_reads.count" % output_prefix

        # First pass: unique (non-multimapped) reads only.
        self.count(alignment_file, gff_file, no_multimapped_read_counts, annotation_file_type=annotation_file_type,
                   min_read_fraction_overlap=min_read_fraction_overlap, feature_type_to_use=feature_type_to_use,
                   attribute_type_to_use=attribute_type_to_use, stranded=stranded)

        # Second pass: multimapped reads included.
        self.count(alignment_file, gff_file, with_multimapped_read_counts, count_multimapped_reads=True,
                   annotation_file_type=annotation_file_type,
                   min_read_fraction_overlap=min_read_fraction_overlap, feature_type_to_use=feature_type_to_use,
                   attribute_type_to_use=attribute_type_to_use, stranded=stranded)

        unique_count_dict = SynDict(filename=no_multimapped_read_counts, comments_prefix="#",
                                    key_index=0, value_index=6, expression=int, header=True)
        total_count_dict = SynDict(filename=with_multimapped_read_counts, comments_prefix="#",
                                   key_index=0, value_index=6, expression=int, header=True)
        # Number of features sharing each read group: count the
        # ";"-separated entries in column 1.
        similar_feature_number_dict = SynDict(filename=with_multimapped_read_counts, comments_prefix="#", header=True,
                                              key_index=0, value_index=1, expression=lambda s: len(s.split(";")))

        sample_label = sample_name if sample_name else similar_feature_number_dict.header.split()[6]

        adjusted_count_dict = SynDict()
        adjusted_count_dict.header = ".\t%s" % sample_label

        for feature_id in unique_count_dict:
            unique_reads = float(unique_count_dict[feature_id])
            extra_reads = float(total_count_dict[feature_id]) - unique_reads
            shared_features = float(similar_feature_number_dict[feature_id])
            # unique + (multimapped share for this feature), rounded up.
            adjusted_count_dict[feature_id] = int(ceil(unique_reads + extra_reads / shared_features))

        adjusted_count_dict.write(all_adjusted_read_counts, header=True)
Ejemplo n.º 14
0
    def count_column_values_from_file(self,
                                      input_file,
                                      column_number,
                                      output_file=None,
                                      separator="\t",
                                      comments_prefix="#",
                                      verbose=False):
        """
        Tally how many times each distinct value occurs in one column of a
        separated-value file.

        @param input_file: input table
        @param column_number: index of the column to tally
        @param output_file: optional file to write the counts to
        @param separator: column separator
        @param comments_prefix: prefix marking comment lines to skip
        @param verbose: accepted for interface compatibility (unused here)
        @return: SynDict mapping column value -> occurrence count
        """
        value_counts = SynDict()

        for fields in self.file_line_as_list_generator(
                input_file, separator=separator,
                comments_prefix=comments_prefix):

            value = fields[column_number]
            if value in value_counts:
                value_counts[value] += 1
            else:
                value_counts[value] = 1

        if output_file:
            value_counts.write(output_file)

        return value_counts
Ejemplo n.º 15
0
    def cluster_sequence_names_by_id_fragment(self,
                                              seq_id_list,
                                              id_element_index,
                                              id_separator="_",
                                              output_prefix=None):
        """
        Group sequence ids by one separator-delimited fragment of the id.

        Ids with fewer fragments than id_element_index requires are collected
        separately and never clustered.

        @param seq_id_list: iterable of sequence ids
        @param id_element_index: index of the fragment to cluster by
        @param id_separator: separator splitting the id into fragments
        @param output_prefix: if set, writes <prefix>.seqid.clusters and
               <prefix>.seqid.skipped.ids
        @return: SynDict mapping fragment -> list of sequence ids
        """
        cluster_dict = SynDict()
        skipped_id_list = IdList()

        for seq_id in seq_id_list:
            seq_id_splited = seq_id.split(id_separator)
            if id_element_index < len(seq_id_splited):
                # BUGFIX: the cluster key is the chosen fragment of the
                # *split id* (seq_id_splited[...]); the original indexed
                # seq_id_list, i.e. keyed every cluster on an unrelated
                # whole id taken from the input list.
                fragment = seq_id_splited[id_element_index]
                if fragment in cluster_dict:
                    cluster_dict[fragment].append(seq_id)
                else:
                    cluster_dict[fragment] = [seq_id]
            else:
                skipped_id_list.append(seq_id)

        if output_prefix:
            cluster_dict.write("%s.seqid.clusters" % output_prefix,
                               splited_values=True)
            skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

        return cluster_dict
Ejemplo n.º 16
0
    def extract_clusters_by_size(cluster_dict,
                                 min_cluster_size=None,
                                 max_cluster_size=None,
                                 white_list_ids=None,
                                 out_file=None):
        """
        Keep only the clusters whose element count falls within the given
        bounds (inclusive), optionally restricted to a whitelist of ids.

        @param cluster_dict: mapping cluster id -> list of elements
        @param min_cluster_size: minimum size, or None for no lower bound
        @param max_cluster_size: maximum size, or None for no upper bound
        @param white_list_ids: if set, only these cluster ids are considered
        @param out_file: optional file to write the filtered clusters to
        @raises ValueError: if neither bound is set
        @return: SynDict with the clusters that passed the filter
        """
        filtered_cluster_dict = SynDict()

        if (min_cluster_size is None) and (max_cluster_size is None):
            raise ValueError(
                "ERROR!!! Neither minimum nor maximum cluster size thresholds were set"
            )

        if (min_cluster_size is not None) and (max_cluster_size is not None):

            def size_is_ok(element_list):
                return min_cluster_size <= len(element_list) <= max_cluster_size
        elif max_cluster_size is not None:

            def size_is_ok(element_list):
                return len(element_list) <= max_cluster_size
        else:

            def size_is_ok(element_list):
                return min_cluster_size <= len(element_list)

        for cluster_id in cluster_dict:
            if white_list_ids and (cluster_id not in white_list_ids):
                continue
            if size_is_ok(cluster_dict[cluster_id]):
                filtered_cluster_dict[cluster_id] = cluster_dict[cluster_id]

        if out_file:
            filtered_cluster_dict.write(filename=out_file, splited_values=True)

        return filtered_cluster_dict
Ejemplo n.º 17
0
    def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None, species_id=None):
        """
        Pull the eggNOG families that contain proteins listed in a synonym
        dictionary of common protein names.

        @param eggnog_fam_dict: SynDict mapping family id -> protein ids
        @param protein_syn_dict: SynDict mapping common protein name ->
               protein ids
        @param output_prefix: if set, writes the extracted families, the two
               correspondence tables and the list of common names that were
               not found
        @param species_id: if set, protein ids are looked up as
               "<species_id>.<protein_id>" (eggNOG-style prefixing)
        @return: tuple (extracted_families,
                 common_protein_names_to_families_dict,
                 common_names_to_eggnog_proteins_syn_dict,
                 not_found_proteins_common_names)
        """
        extracted_families = SynDict()
        common_protein_names_to_families_dict = SynDict()
        common_names_to_eggnog_proteins_syn_dict = SynDict()

        not_found_proteins_common_names = IdList()

        # Inverse lookup: protein id -> family id(s).
        transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

        for common_protein_name in protein_syn_dict:
            not_found = True
            for protein_id in protein_syn_dict[common_protein_name]:
                extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
                if extended_protein_id in transposed_eggnog_fam_dict:
                    not_found = False
                    # Record the family and the matching eggNOG protein id
                    # under the common name (first hit creates the lists).
                    if common_protein_name not in common_protein_names_to_families_dict:
                        common_protein_names_to_families_dict[common_protein_name] = [transposed_eggnog_fam_dict[extended_protein_id][0]]
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                    else:
                        common_protein_names_to_families_dict[common_protein_name].append(transposed_eggnog_fam_dict[extended_protein_id][0])
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                    # Copy the full family once, on first encounter.
                    if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                        extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]

            if not_found:
                not_found_proteins_common_names.append(common_protein_name)

        if output_prefix:
            extracted_families.write(filename="%s.extracted_families.fam" % output_prefix, splited_values=True)
            common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix, splited_values=True)
            common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix, splited_values=True)
            not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

        return extracted_families, common_protein_names_to_families_dict, \
               common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
Ejemplo n.º 18
0
    accordance_dict[species].read(accordance_file, key_index=1, value_index=0)


if args.name_first:
    # Ids look like "<species><sep><gene>": species is the first element.
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[0], args.name_separator.join(gene_list[1:])
else:
    # Ids look like "<gene><sep><species>": species is the last element.
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[-1], args.name_separator.join(gene_list[:-1])

families_with_errors = IdSet()
# Translate every peptide id of every family into the corresponding CDS id
# using the per-species accordance tables.
for family in pep_fam_dict:
    cds_fam_dict[family] = []
    for pep in pep_fam_dict[family]:
        species, pep_name = split_name(pep)
        if pep_name in accordance_dict[species]:
            cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \
                "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species)
            cds_fam_dict[family].append(cds_name)
        else:
            print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name))
            families_with_errors.add(family)

# Families with any unmapped peptide are dropped from the output entirely.
for family in families_with_errors:
    cds_fam_dict.pop(family, None)

families_with_errors.write(args.fam_error)
cds_fam_dict.write(args.output, splited_values=True)
Ejemplo n.º 19
0
    def add_flanks_to_gff_record(self,
                                 input_gff,
                                 output_prefix,
                                 left_flank_len,
                                 right_flank_len,
                                 fasta_file,
                                 coords_description_entry="core_seq_coords",
                                 id_description_entry="ID"):
        """
        Extend every GFF record by left/right flanks, clipped to scaffold
        boundaries.

        Writes "<output_prefix>.gff" with adjusted coordinates.  Each record
        gets two extra attributes: the original (core) coordinates and the
        core coordinates relative to the flanked record (1-based,
        strand-aware).  Records whose flanks had to be shortened are listed
        in "<output_prefix>.short_flanks.dat" as "left_len,right_len".

        @param input_gff: input GFF file
        @param output_prefix: prefix for the output files
        @param left_flank_len: requested flank length on the 5' side
        @param right_flank_len: requested flank length on the 3' side
        @param fasta_file: FASTA with the scaffolds (used for clipping)
        @param coords_description_entry: attribute name for core coordinates
        @param id_description_entry: attribute holding the record id
        """
        sequence_length_dict = self.get_lengths_from_seq_file(fasta_file)
        shorter_flanks_dict = SynDict()

        output_gff = "%s.gff" % output_prefix
        short_flanks_file = "%s.short_flanks.dat" % output_prefix

        with open(input_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    if line[0] == "#":
                        out_fd.write(line)
                        continue
                    line_list = line.strip().split("\t")
                    start = int(line_list[3])
                    end = int(line_list[4])

                    record_id = OrderedDict(
                        map(lambda s: s.split("="),
                            line_list[8].split(";")))[id_description_entry]

                    # Remember the original (core) coordinates.
                    line_list[8] += ";%s=%i,%i" % (coords_description_entry,
                                                   start, end)

                    # For minus-strand features the left/right flanks are
                    # swapped in scaffold coordinates.
                    if line_list[6] == "-":
                        if start - right_flank_len > 0:
                            line_list[3] = str(start - right_flank_len)
                            right_flank_length = right_flank_len
                        else:
                            right_flank_length = start - 1
                            line_list[3] = "1"

                        if end + left_flank_len <= sequence_length_dict[
                                line_list[0]]:
                            line_list[4] = str(end + left_flank_len)
                            left_flank_length = left_flank_len
                        else:
                            left_flank_length = sequence_length_dict[
                                line_list[0]] - end
                            # BUGFIX: this was assigned as an int, which made
                            # the final "\t".join(line_list) raise TypeError
                            # for minus-strand records clipped at the
                            # scaffold end (the plus-strand branch already
                            # used str()).
                            line_list[4] = str(
                                sequence_length_dict[line_list[0]])
                    else:
                        if start - left_flank_len > 0:
                            line_list[3] = str(start - left_flank_len)
                            left_flank_length = left_flank_len
                        else:
                            left_flank_length = start - 1
                            line_list[3] = "1"

                        if end + right_flank_len <= sequence_length_dict[
                                line_list[0]]:
                            line_list[4] = str(end + right_flank_len)
                            right_flank_length = right_flank_len
                        else:
                            right_flank_length = sequence_length_dict[
                                line_list[0]] - end
                            line_list[4] = str(
                                sequence_length_dict[line_list[0]])

                    if (left_flank_length < left_flank_len) or (
                            right_flank_length < right_flank_len):
                        print("%s: Short flank" % record_id)
                        shorter_flanks_dict[record_id] = "%i,%i" % (
                            left_flank_length, right_flank_length)
                    # Core coordinates relative to the flanked record.
                    line_list[8] += ";%s_relative=%i,%i\n" % (
                        coords_description_entry, 1 +
                        (right_flank_length if line_list[6] == "-" else
                         left_flank_length), end - start + 1 +
                        (right_flank_length
                         if line_list[6] == "-" else left_flank_length))
                    out_fd.write("\t".join(line_list))

        shorter_flanks_dict.write(short_flanks_file)
Ejemplo n.º 20
0
# Collect, per key, the union of synonym ids across all species.
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                # Strip the two-character prefix before the ids
                # (presumably a species/status tag -- TODO confirm).
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
complicated_families_syn_dict.write("complicated_families_connections.t",
                                    splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            # BUGFIX: was tmp[i][2] (a single character), inconsistent with
            # the identical prefix-stripping loop above; [2:] drops the
            # prefix and keeps the rest of the entry for the split below.
            tmp[i] = tmp[i][2:]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)
complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")

assemled_to_different_families = species_syn_dict.filter_by_line(
    filter_different_assembly)
Ejemplo n.º 21
0
# Collect the leaf names of every tree in a directory, keyed by tree file
# name (without extension), and write them as a table.
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--tree_dir", action="store", dest="tree_dir", required=True, type=check_path,
                    help="Directory with trees")
parser.add_argument("-f", "--tree_format", action="store", dest="tree_format", default=1, type=int,
                    help="Format of input trees")
parser.add_argument("-o", "--output_file", action="store", dest="output_file", default="stdout",
                    help="Output file with leaves of trees. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output_file == "stdout" else open(args.output_file, "w")

tree_files_list = os.listdir(args.tree_dir)

names_dict = SynDict()

for tree_file in tree_files_list:
    tree_name = split_filename(tree_file)[1]
    # Trees are expected to be single-line newick files.
    with open("%s%s" % (args.tree_dir, tree_file), "r") as tree_fd:
        tree = Tree(tree_fd.readline().strip(), format=args.tree_format)
    leaves_list = []
    for node in tree.traverse():
        if node.is_leaf():
            leaves_list.append(node.name)
    names_dict[tree_name] = leaves_list

# BUGFIX: args.outp_fd does not exist (argparse defines only tree_dir,
# tree_format and output_file) and raised AttributeError at runtime; write
# to the stream opened for that purpose above.  NOTE(review): if
# SynDict.write only accepts a filename, pass args.output_file instead --
# confirm against the SynDict API.
names_dict.write(out_fd, splited_values=True)
if args.output_file != "stdout":
    out_fd.close()
Ejemplo n.º 22
0
    def replace_augustus_ids(augustus_gff,
                             output_prefix,
                             species_prefix=None,
                             number_of_digits_in_id=8):
        """
        Rename AUGUSTUS gene/transcript/CDS identifiers in a GFF file to
        uniform, species-prefixed, zero-padded ids (e.g. <prefix>G00000001,
        <prefix>T00000001, <prefix>C00000001).

        Writes four files derived from ``output_prefix``:
            <output_prefix>.renamed.gff     GFF with replaced ids
            <output_prefix>.gene.syn        old gene id -> new gene id
            <output_prefix>.transcript.syn  old transcript id -> new id
            <output_prefix>.cds.syn         old CDS id -> new id

        Raises ValueError when a feature's Parent does not match the
        enclosing gene/transcript id.

        NOTE(review): with the default ``species_prefix=None`` the ids are
        rendered as "NoneG..." — callers presumably always pass a prefix;
        verify against call sites.
        """

        output_gff = "%s.renamed.gff" % output_prefix
        genes_syn_file = "%s.gene.syn" % output_prefix
        transcripts_syn_file = "%s.transcript.syn" % output_prefix
        cds_syn_file = "%s.cds.syn" % output_prefix
        genes_syn_dict = SynDict()
        transcripts_syn_dict = SynDict()
        cds_syn_dict = SynDict()
        gene_counter = 0
        # "%sG%%0%ii" expands to e.g. "PREFIXG%08i" — a template taking the
        # running counter as its single argument.
        gene_id_template = "%sG%%0%ii" % (species_prefix,
                                          number_of_digits_in_id)
        transcripts_counter = 0
        transcript_id_template = "%sT%%0%ii" % (species_prefix,
                                                number_of_digits_in_id)
        cds_counter = 0
        cds_id_template = "%sC%%0%ii" % (species_prefix,
                                         number_of_digits_in_id)
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    # Pass through anything shorter than "# start gene"
                    # (12 chars) or not a gene-start marker.
                    if len(tmp) < 13:
                        out_fd.write(line)
                        continue
                    if tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    # Gene block begins: allocate a new synthetic gene id.
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_counter += 1

                    gene_syn_id = gene_id_template % gene_counter
                    genes_syn_dict[augustus_gene_id] = gene_syn_id
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    # From here the gene block is consumed line by line via
                    # readline(), interleaved with the outer for-loop's
                    # iterator position.
                    tmp = in_fd.readline().strip()
                    while True:
                        # Feature lines (non-comment): rewrite ID/Parent.
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            # Keep all columns except the info field, which
                            # is rebuilt below.
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_id not in transcripts_syn_dict:
                                            transcripts_counter += 1
                                            transcripts_syn_dict[
                                                augustus_transcript_id] = transcript_id_template % transcripts_counter
                                        transcript_syn_id = transcripts_syn_dict[
                                            augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split(
                                            "=")[-1]
                                        # Sanity check: transcript must
                                        # belong to the current gene.
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError(
                                                "Transcript parent id and gene id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        if augustus_cds_id not in cds_syn_dict:
                                            cds_counter += 1
                                            cds_syn_dict[
                                                augustus_cds_id] = cds_id_template % cds_counter
                                        cds_syn_id = cds_syn_dict[
                                            augustus_cds_id]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split(
                                            "=")[-1]
                                        # CDS must belong to the transcript
                                        # most recently seen above.
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "CDS parent id and transcript id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    cds_syn_id, transcript_syn_id)
                            elif (feature_type
                                  == "stop_codon") or (feature_type
                                                       == "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split(
                                            "=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "Feature parent id and transcript id are not same!"
                                            )
                                edited_str += "\tParent=%s\n" % transcript_syn_id
                            else:
                                # Unknown feature types are passed through
                                # unchanged.
                                edited_str = tmp + "\n"

                            out_fd.write(edited_str)
                            tmp = in_fd.readline().strip()
                        # Comment lines inside the gene block are copied
                        # verbatim until "# end gene" terminates the block.
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = in_fd.readline().strip()
                        if "# end gene" in tmp:
                            break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
        genes_syn_dict.write(genes_syn_file)
        transcripts_syn_dict.write(transcripts_syn_file)
        cds_syn_dict.write(cds_syn_file)
Ejemplo n.º 23
0
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")
# Group genes by family id and dump the resulting table.
parser.add_argument("-k", "--family_column", action="store", dest="fam_col",
                    type=int, default=1,
                    help="Family column position(0-based). Default: 1")
parser.add_argument("-a", "--genes_column", action="store", dest="gen_col",
                    type=int, default=0,
                    help="Genes column position(0-based). Default: 0")

args = parser.parse_args()

family_dict = SynDict()

# Family ids repeat across rows, so repeated keys accumulate their genes.
family_dict.read(args.input,
                 key_index=args.fam_col,
                 value_index=args.gen_col,
                 header=args.header,
                 allow_repeats_of_key=True)

family_dict.write(args.output, splited_values=True)
Ejemplo n.º 24
0
    os.system(exe_string)

# Extract sequence ids from the unique-description file via an external
# awk one-liner prepared earlier in the script.
os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

# Group isoform ids by gene: key is column 1 (presumably the gene name),
# values are the comma-separated isoform ids from column 0 —
# TODO confirm column semantics against the file produced above.
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions,
              header=False,
              separator="\t",
              allow_repeats_of_key=True,
              split_values=True,
              values_separator=",",
              key_index=1,
              value_index=0,
              comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms,
               splited_values=True,
               values_separator=",")

# Per-sequence lengths of the input FASTA, also persisted to len_file.
length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input,
                                                         format="fasta",
                                                         out_file=len_file)

# Output handles consumed by the per-gene loop that follows.
descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")
for gene in syn_dict:
    len_list = []
    longest_isoform = None
    max_len = 0
    for isoform_id in syn_dict[gene]:
Ejemplo n.º 25
0
 def extract_hits_from_tbl_output(blast_hits, output_file):
     """Group tabular BLAST hits by query id and persist the mapping.

     Reads a tab-separated hit table (column 0 -> key, column 1 -> value),
     writes the grouped table to ``output_file`` and returns it.
     """
     hit_dict = SynDict()
     hit_dict.read(blast_hits,
                   key_index=0,
                   value_index=1,
                   separator="\t",
                   allow_repeats_of_key=True)
     hit_dict.write(output_file,
                    separator="\t",
                    splited_values=True,
                    values_separator=",")
     return hit_dict
Ejemplo n.º 26
0
# For every family listed in the input, pick one random representative
# gene from the reference clustering; families missing from the
# reference are marked with ".".
out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam,
                        separator="\t",
                        split_values=True,
                        values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input,
                     header=True,
                     column_number=0,
                     column_separator="\t")

reference_random_genes = SynDict()

for family_id in node_family_ids:
    if family_id in reference_families:
        reference_random_genes[family_id] = choice(
            reference_families[family_id])
    else:
        # No reference genes known for this family.
        reference_random_genes[family_id] = "."

reference_random_genes.write("%s_reference_random_genes.t" %
                             args.output_prefix)

# Only families that actually received a gene go to the ids file.
for family_id in reference_random_genes:
    if reference_random_genes[family_id] != ".":
        out_fd.write("%s\n" % reference_random_genes[family_id])

# BUG FIX: the original leaked the output file descriptor; close it so
# the ids file is flushed to disk.
if out_fd is not sys.stdout:
    out_fd.close()
Ejemplo n.º 27
0
                    "--remove_value_repeats",
                    action="store_true",
                    dest="remove_value_repeats",
                    help="Remove repeats of values")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

# Load the key/value table, accumulating repeated keys into value lists.
syn_dict = SynDict()
syn_dict.read(args.input,
              header=False,
              separator=args.column_separator,
              allow_repeats_of_key=True,
              split_values=True,
              values_separator=args.value_separator,
              key_index=args.key_column,
              value_index=args.value_column,
              comments_prefix=args.comments_prefix)

# Optionally drop duplicated values per key before writing the table.
result_dict = (syn_dict.remove_value_repeats()
               if args.remove_value_repeats else syn_dict)

# close_after_if_file_object=True lets write() close out_fd when it is a
# real file (stdout is left open).
result_dict.write(out_fd,
                  splited_values=True,
                  values_separator=args.value_separator,
                  close_after_if_file_object=True)