Ejemplo n.º 1
0
    def get_families_from_top_hits(top_hits_file, fam_file):
        """
        Collect top hits into families and save them.

        The top hits file is loaded into a SynDict with key_index=1 and
        value_index=0, repeated keys allowed and lines starting with "#"
        treated as comments. The resulting dict is written to fam_file with
        splited_values=True and is also returned.
        """
        families = SynDict()
        families.read(top_hits_file,
                      comments_prefix="#",
                      key_index=1,
                      value_index=0,
                      allow_repeats_of_key=True)
        families.write(fam_file, splited_values=True)

        return families
Ejemplo n.º 2
0
    def count_unique_positions_per_sequence_from_file(self,
                                                      alignment_file,
                                                      output_prefix,
                                                      format="fasta",
                                                      gap_symbol="-",
                                                      return_mode="absolute",
                                                      verbose=True):
        """
        Count, per aligned sequence, the columns whose presence value is 1 or -1.

        The alignment is loaded with AlignIO.read and the per-cell presence
        values come from self.get_position_presence_matrix (the meaning of the
        values is defined by that helper).

        Two files are written:
          "<output_prefix>.absolute_counts" - the raw count for every sequence id;
          "<output_prefix>.percent_counts"  - the count as a percentage of the
              alignment length minus the number of gap_symbol occurrences in
              that sequence.

        Returns the absolute-count dict when return_mode == "absolute",
        otherwise the percent dict.

        Raises ZeroDivisionError when the number of gap symbols in a sequence
        equals the alignment length.
        """
        alignment = AlignIO.read(alignment_file, format=format)
        sequence_total = len(alignment)
        column_total = len(alignment[0])
        presence = self.get_position_presence_matrix(alignment,
                                                     gap_symbol=gap_symbol,
                                                     verbose=verbose)
        absolute_counts = SynDict()
        percent_counts = SynDict()

        for row_index in range(sequence_total):
            record = alignment[row_index]
            # a column counts when its presence value is exactly 1 or -1
            unique_total = 0
            for column_index in range(column_total):
                cell = presence[row_index, column_index]
                if cell == 1 or cell == -1:
                    unique_total += 1

            ungapped_length = column_total - str(record.seq).count(gap_symbol)
            absolute_counts[record.id] = unique_total
            percent_counts[record.id] = 100 * float(unique_total) / ungapped_length

        absolute_counts.write("%s.absolute_counts" % output_prefix)
        percent_counts.write("%s.percent_counts" % output_prefix)

        if return_mode == "absolute":
            return absolute_counts
        return percent_counts
Ejemplo n.º 3
0
 def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
     """
     Load hits from a domtblout file into a SynDict and optionally save them.

     The file is read with key_index=3 and value_index=0, separator=None,
     no header, repeated keys allowed and "#" as the comment prefix.
     When output_file is truthy the dict is written there with
     splited_values=True. The dict is returned in either case.
     """
     domain_hits = SynDict()
     domain_hits.read(domtblout_file,
                      comments_prefix="#",
                      header=False,
                      separator=None,
                      key_index=3,
                      value_index=0,
                      allow_repeats_of_key=True)
     if not output_file:
         return domain_hits

     domain_hits.write(output_file, splited_values=True)
     return domain_hits
Ejemplo n.º 4
0
    def get_transcript_to_pep_accordance_from_gtf(gtf_file,
                                                  output_file,
                                                  comment_symbol="#"):
        """
        Build a transcript id -> set of protein ids table from a GTF file
        and write it to output_file.

        Lines whose first character equals comment_symbol are skipped. Only the
        last tab-separated column is inspected: it is split on ";" and every
        two-token entry named transcript_id or protein_id is used, with the
        first and last character of its value (the quotes) dropped. A line
        contributes only when it carries both ids.

        Tested on gtf files from Ensembl release 70
        """
        wanted_keys = ("transcript_id", "protein_id")
        transcript_to_proteins = SynDict()

        with open(gtf_file, "r") as gtf_handle:
            for gtf_line in gtf_handle:
                if gtf_line[0] == comment_symbol:
                    continue

                attribute_column = gtf_line.strip().split("\t")[-1]
                found = {"transcript_id": None, "protein_id": None}
                for attribute in attribute_column.split(";"):
                    tokens = attribute.split()
                    if len(tokens) == 2 and tokens[0] in wanted_keys:
                        found[tokens[0]] = tokens[1][1:-1]  # remove quotes

                transcript_id = found["transcript_id"]
                protein_id = found["protein_id"]
                if transcript_id is None or protein_id is None:
                    continue

                if transcript_id in transcript_to_proteins:
                    transcript_to_proteins[transcript_id].add(protein_id)
                else:
                    transcript_to_proteins[transcript_id] = {protein_id}

        transcript_to_proteins.write(output_file, splited_values=True)
Ejemplo n.º 5
0
 def extract_hits_from_tbl_output(blast_hits, output_file):
     """
     Load a tab-separated hits table into a SynDict and save it.

     The table is read with key_index=0 and value_index=1, repeated keys
     allowed. The dict is written to output_file with splited_values=True,
     a tab between columns and "," between values, and is then returned.
     """
     column_separator = "\t"

     hit_table = SynDict()
     hit_table.read(blast_hits,
                    separator=column_separator,
                    key_index=0,
                    value_index=1,
                    allow_repeats_of_key=True)
     hit_table.write(output_file,
                     separator=column_separator,
                     values_separator=",",
                     splited_values=True)
     return hit_table
Ejemplo n.º 6
0
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            elements_with_absent_synonyms_file=None,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        """
        Replace cluster elements with their synonyms and write the result.

        Synonyms are loaded from syn_file using the given key/value column
        indexes and column separator; clusters are loaded from clusters_file
        with comma-separated values. Both files use "#" as comment prefix.

        Every element found in the synonym table is replaced by its synonym;
        elements without a synonym are kept under their original name and are
        recorded per cluster. When remove_clusters_with_not_renamed_elements is
        set, clusters containing at least one such element are left out of the
        output.

        The renamed clusters are written to output_clusters_file. If
        elements_with_absent_synonyms_file is given, the recorded elements are
        written there as well.

        Returns the dict of elements without synonyms, keyed by cluster.
        """
        synonyms = SynDict()
        synonyms.read(syn_file,
                      separator=syn_file_column_separator,
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      comments_prefix="#")

        clusters = SynDict()
        clusters.read(clusters_file,
                      comments_prefix="#",
                      split_values=True,
                      values_separator=",")

        renamed_clusters = SynDict()
        missing_synonyms = SynDict()

        for cluster_id in clusters:
            renamed_members = []
            fully_renamed = True

            for member in clusters[cluster_id]:
                if member in synonyms:
                    renamed_members.append(synonyms[member])
                    continue

                # no synonym: remember the element and keep its original name
                fully_renamed = False
                if cluster_id in missing_synonyms:
                    missing_synonyms[cluster_id].append(member)
                else:
                    missing_synonyms[cluster_id] = [member]
                renamed_members.append(member)

            if fully_renamed or not remove_clusters_with_not_renamed_elements:
                renamed_clusters[cluster_id] = renamed_members

        renamed_clusters.write(output_clusters_file, splited_values=True)

        if elements_with_absent_synonyms_file:
            missing_synonyms.write(elements_with_absent_synonyms_file,
                                   splited_values=True)

        return missing_synonyms
Ejemplo n.º 7
0
    def replace_augustus_ids(augustus_gff,
                             output_prefix,
                             species_prefix=None,
                             number_of_digits_in_id=8):
        """
        Rewrite an AUGUSTUS GFF, replacing gene, transcript and CDS ids with
        sequentially numbered ones.

        New ids have the form <species_prefix>G<counter>, <species_prefix>T<counter>
        and <species_prefix>C<counter>, where the counter is zero-padded to
        number_of_digits_in_id digits. species_prefix is interpolated with %s,
        so leaving it at None produces ids that start with the text "None".

        Lines outside "# start gene <id>" ... "# end gene <id>" blocks are
        copied unchanged. Inside a block:
          - gene lines get "ID=<new gene id>" as their last column;
          - transcript lines get "ID=<new transcript id>;Parent=<new gene id>";
          - CDS lines get "ID=<new cds id>;Parent=<new transcript id>";
          - start_codon / stop_codon lines get "Parent=<new transcript id>";
          - other feature lines and comment lines are written back stripped of
            surrounding whitespace.

        Output files:
          <output_prefix>.renamed.gff     - the rewritten annotation
          <output_prefix>.gene.syn        - original gene id -> new id
          <output_prefix>.transcript.syn  - original transcript id -> new id
          <output_prefix>.cds.syn         - original CDS id -> new id

        Raises:
            ValueError: a Parent attribute does not match the id of the
                enclosing gene / current transcript.
            StopIteration: the input ends before the "# end gene" line of a
                started gene block.
            IndexError: a gene block contains an empty line.
        """
        output_gff = "%s.renamed.gff" % output_prefix
        genes_syn_file = "%s.gene.syn" % output_prefix
        transcripts_syn_file = "%s.transcript.syn" % output_prefix
        cds_syn_file = "%s.cds.syn" % output_prefix
        genes_syn_dict = SynDict()
        transcripts_syn_dict = SynDict()
        cds_syn_dict = SynDict()
        gene_counter = 0
        # "%%0%ii" expands to e.g. "%08i": the counter format is built first,
        # the counter itself is substituted later
        gene_id_template = "%sG%%0%ii" % (species_prefix,
                                          number_of_digits_in_id)
        transcripts_counter = 0
        transcript_id_template = "%sT%%0%ii" % (species_prefix,
                                                number_of_digits_in_id)
        cds_counter = 0
        cds_id_template = "%sC%%0%ii" % (species_prefix,
                                         number_of_digits_in_id)
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    # anything that is not a "# start gene <id>" line is passed through
                    if len(tmp) < 13 or tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_counter += 1

                    gene_syn_id = gene_id_template % gene_counter
                    genes_syn_dict[augustus_gene_id] = gene_syn_id
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    # The gene block is consumed here by advancing the same
                    # file iterator; the built-in next() is used because file
                    # objects have no .next() method on Python 3.
                    tmp = next(in_fd).strip()
                    while True:
                        # feature lines of the block
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_id not in transcripts_syn_dict:
                                            transcripts_counter += 1
                                            transcripts_syn_dict[
                                                augustus_transcript_id] = transcript_id_template % transcripts_counter
                                        transcript_syn_id = transcripts_syn_dict[
                                            augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split(
                                            "=")[-1]
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError(
                                                "Transcript parent id and gene id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        if augustus_cds_id not in cds_syn_dict:
                                            cds_counter += 1
                                            cds_syn_dict[
                                                augustus_cds_id] = cds_id_template % cds_counter
                                        cds_syn_id = cds_syn_dict[
                                            augustus_cds_id]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split(
                                            "=")[-1]
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "CDS parent id and transcript id are not same!"
                                            )
                                edited_str += "\tID=%s;Parent=%s\n" % (
                                    cds_syn_id, transcript_syn_id)
                            elif (feature_type
                                  == "stop_codon") or (feature_type
                                                       == "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split(
                                            "=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError(
                                                "Feature parent id and transcript id are not same!"
                                            )
                                edited_str += "\tParent=%s\n" % transcript_syn_id
                            else:
                                # other feature types keep their original attributes
                                edited_str = tmp + "\n"

                            out_fd.write(edited_str)
                            tmp = next(in_fd).strip()
                        # comment lines of the block, up to "# end gene"
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = next(in_fd).strip()
                        if "# end gene" in tmp:
                            break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
        genes_syn_dict.write(genes_syn_file)
        transcripts_syn_dict.write(transcripts_syn_file)
        cds_syn_dict.write(cds_syn_file)
Ejemplo n.º 8
0
    def count_miRNA_reads(self,
                          alignment_file,
                          gff_file,
                          output_prefix,
                          annotation_file_type="GTF",
                          min_read_fraction_overlap=1.0,
                          feature_type_to_use=None,
                          attribute_type_to_use=None,
                          sample_name=None,
                          stranded=1):
        """
        Count reads per feature twice (without and with multimapped reads) and
        write counts adjusted for the number of similar features.

        self.count is run two times with identical options, the second time
        with count_multimapped_reads=True. The outputs are
        "<output_prefix>.no_multimapped_reads.count" and
        "<output_prefix>.with_multimapped_reads.count".

        For each feature the adjusted count is
            ceil(unique + (with_multimapped - unique) / n)
        where the counts are taken from column 6 of the two count files and n
        is the number of ";"-separated items in column 1 of the
        with-multimapped file. The adjusted counts are written with a header
        to "<output_prefix>.all_adjusted_reads.count".

        The header column name is sample_name when it is truthy; otherwise it
        is the 7th whitespace-separated token of the count file header.
        """
        unique_counts_file = "%s.no_multimapped_reads.count" % output_prefix
        multimapped_counts_file = "%s.with_multimapped_reads.count" % output_prefix
        adjusted_counts_file = "%s.all_adjusted_reads.count" % output_prefix

        # first pass: multimapped reads are not counted
        self.count(alignment_file,
                   gff_file,
                   unique_counts_file,
                   annotation_file_type=annotation_file_type,
                   min_read_fraction_overlap=min_read_fraction_overlap,
                   feature_type_to_use=feature_type_to_use,
                   attribute_type_to_use=attribute_type_to_use,
                   stranded=stranded)

        # second pass: multimapped reads are counted too
        self.count(alignment_file,
                   gff_file,
                   multimapped_counts_file,
                   count_multimapped_reads=True,
                   annotation_file_type=annotation_file_type,
                   min_read_fraction_overlap=min_read_fraction_overlap,
                   feature_type_to_use=feature_type_to_use,
                   attribute_type_to_use=attribute_type_to_use,
                   stranded=stranded)

        unique_counts = SynDict(filename=unique_counts_file,
                                comments_prefix="#",
                                key_index=0,
                                value_index=6,
                                expression=int,
                                header=True)
        multimapped_counts = SynDict(filename=multimapped_counts_file,
                                     comments_prefix="#",
                                     key_index=0,
                                     value_index=6,
                                     expression=int,
                                     header=True)
        similar_feature_numbers = SynDict(
            filename=multimapped_counts_file,
            comments_prefix="#",
            header=True,
            key_index=0,
            value_index=1,
            expression=lambda s: len(s.split(";")))

        if sample_name:
            column_name = sample_name
        else:
            column_name = similar_feature_numbers.header.split()[6]

        adjusted_counts = SynDict()
        adjusted_counts.header = ".\t%s" % column_name

        for feature_id in unique_counts:
            unique = float(unique_counts[feature_id])
            with_multimapped = float(multimapped_counts[feature_id])
            similar_features = float(similar_feature_numbers[feature_id])
            adjusted_counts[feature_id] = int(
                ceil(unique + (with_multimapped - unique) / similar_features))

        adjusted_counts.write(adjusted_counts_file, header=True)
Ejemplo n.º 9
0
    def draw_length_histogram(sequence_dict,
                              output_prefix,
                              number_of_bins=None,
                              width_of_bins=None,
                              min_length=1,
                              max_length=None,
                              extensions=("png", "svg"),
                              legend_location='best'):
        """
        Write sequence lengths to a file and draw their histogram.

        sequence_dict is iterated for record ids and each value must expose a
        .seq attribute supporting len(). The lengths are written to
        "<output_prefix>.len" and the histogram is saved as
        "<output_prefix>.<ext>" for every ext in extensions.

        Only lengths within the requested range are plotted: the lower bound
        applies when min_length > 1 and the upper bound applies when max_length
        is below the longest sequence. The legend always reports min, max, mean
        and median of ALL lengths, not of the plotted subset.

        Binning: number_of_bins takes priority; otherwise width_of_bins builds
        explicit bin edges from min_length up to the maximum length; otherwise
        30 bins are used.

        The plot is drawn on pyplot figure number 1, which is neither cleared
        nor closed here. A file named "temp.idx" in the working directory is
        removed afterwards if it exists.

        Raises ValueError (from max/min) when sequence_dict is empty.
        """
        length_dict = SynDict()

        for record in sequence_dict:
            length_dict[record] = len(sequence_dict[record].seq)

        length_dict.write("%s.len" % output_prefix)

        # Materialise the values: on Python 3 .values() returns a view that
        # numpy cannot reduce with median()/mean().
        lengths = list(length_dict.values())

        max_len = max(lengths)
        min_len = min(lengths)
        median = np.median(lengths)
        mean = np.mean(lengths)

        maximum_length = max_len if max_length is None else max_length

        # Each bound is applied only when it can actually exclude something,
        # matching the original four-way branching.
        apply_lower_bound = min_length > 1
        apply_upper_bound = maximum_length < max_len

        if apply_lower_bound or apply_upper_bound:
            filtered = [
                entry for entry in lengths
                if (not apply_lower_bound or min_length <= entry) and
                (not apply_upper_bound or entry <= maximum_length)
            ]
        else:
            filtered = lengths

        plt.figure(1, figsize=(6, 6))
        plt.subplot(1, 1, 1)

        if number_of_bins:
            bins = number_of_bins
        elif width_of_bins:
            bins = np.arange(min_length - 1,
                             maximum_length,
                             width_of_bins,
                             dtype=np.int32)
            # first edge is shifted from min_length - 1 to min_length
            bins[0] += 1
            bins = np.append(bins, [maximum_length])
        else:
            bins = 30
        plt.hist(filtered, bins=bins)
        plt.xlim(xmin=min_length, xmax=maximum_length)
        plt.xlabel("Length")
        plt.ylabel("N")
        plt.title("Distribution of sequence lengths")
        plt.legend(("Min: %i\nMax: %i\nMean: %i\nMedian: %i" %
                    (min_len, max_len, mean, median), ),
                   loc=legend_location)
        for ext in extensions:
            plt.savefig("%s.%s" % (output_prefix, ext))

        # "temp.idx" is not created by this function, so it may be absent;
        # an unconditional remove would raise after all outputs were written.
        if os.path.exists("temp.idx"):
            os.remove("temp.idx")
Ejemplo n.º 10
0
    os.system(exe_string)

# Run the awk command built from the awk_extract_ids_string template with the
# description file and the ids file as its two substitutions.
# NOTE(review): the paths are interpolated into a shell command unquoted -
# confirm they cannot contain spaces or shell metacharacters.
os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

# Load the table keyed on column 1 with values from column 0, allowing repeated
# keys, then write it back with comma-separated values.
# (presumably gene/description -> isoform ids, given the loop below iterates
# "for isoform_id in syn_dict[gene]" - TODO confirm against the input format)
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions,
              header=False,
              separator="\t",
              allow_repeats_of_key=True,
              split_values=True,
              values_separator=",",
              key_index=1,
              value_index=0,
              comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms,
               splited_values=True,
               values_separator=",")

# Sequence lengths for the input FASTA; the helper is also given len_file as
# its out_file argument.
length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input,
                                                         format="fasta",
                                                         out_file=len_file)

# NOTE(review): these three handles are opened without a context manager -
# confirm they are closed further down in the script.
descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")

for gene in syn_dict:
    len_list = []
    longest_isoform = None
    max_len = 0
    for isoform_id in syn_dict[gene]: