def get_families_from_top_hits(top_hits_file, fam_file):
    """Build gene-family clusters from a top-hits table.

    Reads top_hits_file treating column 1 as the family key and column 0
    as the member id (repeated keys accumulate members), writes the
    resulting clusters to fam_file and returns the populated SynDict.
    """
    families = SynDict()
    families.read(top_hits_file,
                  allow_repeats_of_key=True,
                  key_index=1,
                  value_index=0,
                  comments_prefix="#")
    families.write(fam_file, splited_values=True)
    return families
def count_unique_positions_per_sequence_from_file(self, alignment_file, output_prefix, format="fasta",
                                                  gap_symbol="-", return_mode="absolute", verbose=True):
    """Count alignment columns unique to each sequence of an alignment file.

    A column counts as unique for a row when the position-presence matrix
    holds 1 or -1 there. Writes absolute counts to
    <output_prefix>.absolute_counts and counts as a percentage of the
    sequence's ungapped length to <output_prefix>.percent_counts.

    Returns the absolute-count SynDict when return_mode == "absolute",
    otherwise the percent-count SynDict.
    """
    alignment = AlignIO.read(alignment_file, format=format)
    sequence_number = len(alignment)
    alignment_length = len(alignment[0])

    presence_matrix = self.get_position_presence_matrix(alignment,
                                                        gap_symbol=gap_symbol,
                                                        verbose=verbose)

    absolute_counts = SynDict()
    percent_counts = SynDict()

    for row_index in range(sequence_number):
        record = alignment[row_index]
        # 1 / -1 in the presence matrix mark positions unique to this row
        unique_positions = sum(
            1 for column in range(alignment_length)
            if presence_matrix[row_index, column] in (1, -1))
        ungapped_length = alignment_length - str(record.seq).count(gap_symbol)

        absolute_counts[record.id] = unique_positions
        percent_counts[record.id] = 100 * float(unique_positions) / ungapped_length

    absolute_counts.write("%s.absolute_counts" % output_prefix)
    percent_counts.write("%s.percent_counts" % output_prefix)

    if return_mode == "absolute":
        return absolute_counts
    return percent_counts
def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
    """Collect query names per domain from an HMMER domtblout table.

    Column 3 (domain name) becomes the key and column 0 (query name) the
    value; repeated keys accumulate. The mapping is written to
    output_file when one is given, and is always returned.
    """
    domain_hits = SynDict()
    domain_hits.read(domtblout_file,
                     header=False,
                     separator=None,
                     allow_repeats_of_key=True,
                     key_index=3,
                     value_index=0,
                     comments_prefix="#")
    if output_file:
        domain_hits.write(output_file, splited_values=True)
    return domain_hits
def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
    """Extract transcript-to-protein id correspondence from a GTF file.

    Tested on gtf files from Ensembl release 70.

    Scans every feature line, parses the trailing attribute field and
    records, for each transcript_id, the set of protein_ids seen with it.
    The mapping is written to output_file with splitted values.
    """
    accordance_dict = SynDict()
    with open(gtf_file, "r") as gtf_fd:
        for line in gtf_fd:
            # startswith also handles an empty trailing line safely
            # (indexing line[0] would raise IndexError there)
            if line.startswith(comment_symbol):
                continue
            attribute_entries = line.strip().split("\t")[-1].split(";")
            protein_id = None
            transcript_id = None
            for entry in attribute_entries:
                tmp_entry = entry.split()
                if len(tmp_entry) != 2:
                    continue
                if tmp_entry[0] == "transcript_id":
                    transcript_id = tmp_entry[1][1:-1]  # strip surrounding quotes
                elif tmp_entry[0] == "protein_id":
                    protein_id = tmp_entry[1][1:-1]  # strip surrounding quotes
            if (transcript_id is not None) and (protein_id is not None):
                if transcript_id in accordance_dict:
                    accordance_dict[transcript_id].add(protein_id)
                else:
                    accordance_dict[transcript_id] = {protein_id}
    accordance_dict.write(output_file, splited_values=True)
def extract_hits_from_tbl_output(blast_hits, output_file):
    """Group BLAST tabular hits by query id.

    Reads a tab-separated hit table with column 0 as the query id and
    column 1 as the subject id (repeated queries accumulate subjects),
    writes the grouped hits to output_file and returns the SynDict.
    """
    query_to_subjects = SynDict()
    query_to_subjects.read(blast_hits,
                           allow_repeats_of_key=True,
                           key_index=0,
                           value_index=1,
                           separator="\t")
    query_to_subjects.write(output_file,
                            splited_values=True,
                            separator="\t",
                            values_separator=",")
    return query_to_subjects
def rename_elements_in_clusters(clusters_file,
                                syn_file,
                                output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                elements_with_absent_synonyms_file=None,
                                syn_file_key_column_index=0,
                                syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    """Rename cluster members using a synonym table.

    Members missing from the synonym table keep their original names and
    are collected per cluster; such clusters are dropped from the output
    when remove_clusters_with_not_renamed_elements is set. Writes the
    renamed clusters to output_clusters_file, optionally writes the
    not-renamed members, and returns the SynDict of absent members.
    """
    syn_dict = SynDict()
    syn_dict.read(syn_file,
                  comments_prefix="#",
                  key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index,
                  separator=syn_file_column_separator)

    clusters_dict = SynDict()
    clusters_dict.read(clusters_file,
                       split_values=True,
                       values_separator=",",
                       comments_prefix="#")

    renamed_clusters = SynDict()
    absent_members = SynDict()

    for cluster_id in clusters_dict:
        renamed_member_list = []
        fully_renamed = True
        for member in clusters_dict[cluster_id]:
            if member in syn_dict:
                renamed_member_list.append(syn_dict[member])
            else:
                # keep original name and remember it as missing a synonym
                fully_renamed = False
                if cluster_id in absent_members:
                    absent_members[cluster_id].append(member)
                else:
                    absent_members[cluster_id] = [member]
                renamed_member_list.append(member)
        # equivalent to the original (not remove) or (remove and all_renamed)
        if fully_renamed or not remove_clusters_with_not_renamed_elements:
            renamed_clusters[cluster_id] = renamed_member_list

    renamed_clusters.write(output_clusters_file, splited_values=True)
    if elements_with_absent_synonyms_file:
        absent_members.write(elements_with_absent_synonyms_file, splited_values=True)
    return absent_members
def replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):
    """Replace AUGUSTUS gene/transcript/CDS ids with zero-padded synthetic ids.

    Rewrites augustus_gff to <output_prefix>.renamed.gff, assigning ids of
    the form <species_prefix>G########, ...T########, ...C######## (the
    pad width is number_of_digits_in_id) and writes the old-to-new id
    mappings to <output_prefix>.gene.syn / .transcript.syn / .cds.syn.

    Raises ValueError when a feature's Parent id does not match the
    enclosing gene/transcript id.
    """
    output_gff = "%s.renamed.gff" % output_prefix
    genes_syn_file = "%s.gene.syn" % output_prefix
    transcripts_syn_file = "%s.transcript.syn" % output_prefix
    cds_syn_file = "%s.cds.syn" % output_prefix

    genes_syn_dict = SynDict()
    transcripts_syn_dict = SynDict()
    cds_syn_dict = SynDict()

    gene_counter = 0
    gene_id_template = "%sG%%0%ii" % (species_prefix, number_of_digits_in_id)
    transcripts_counter = 0
    transcript_id_template = "%sT%%0%ii" % (species_prefix, number_of_digits_in_id)
    cds_counter = 0
    cds_id_template = "%sC%%0%ii" % (species_prefix, number_of_digits_in_id)

    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                # pass through everything until a "# start gene" marker
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue

                augustus_gene_id = tmp.split(" ")[-1]
                gene_counter += 1
                gene_syn_id = gene_id_template % gene_counter
                genes_syn_dict[augustus_gene_id] = gene_syn_id
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                # next() builtin instead of the Python-2-only file.next()
                tmp = next(in_fd).strip()
                while True:
                    # feature lines (non-comment) of the current gene
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    if augustus_transcript_id not in transcripts_syn_dict:
                                        transcripts_counter += 1
                                        transcripts_syn_dict[augustus_transcript_id] = \
                                            transcript_id_template % transcripts_counter
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError(
                                            "Transcript parent id and gene id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    if augustus_cds_id not in cds_syn_dict:
                                        cds_counter += 1
                                        cds_syn_dict[augustus_cds_id] = cds_id_template % cds_counter
                                    cds_syn_id = cds_syn_dict[augustus_cds_id]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError(
                                            "CDS parent id and transcript id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError(
                                            "Feature parent id and transcript id are not same!")
                            edited_str += "\tParent=%s\n" % transcript_syn_id
                        else:
                            # unknown feature type: pass the line through untouched
                            edited_str = tmp + "\n"
                        out_fd.write(edited_str)
                        tmp = next(in_fd).strip()
                    # comment lines between features
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = next(in_fd).strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)

    genes_syn_dict.write(genes_syn_file)
    transcripts_syn_dict.write(transcripts_syn_file)
    cds_syn_dict.write(cds_syn_file)
def count_miRNA_reads(self, alignment_file, gff_file, output_prefix, annotation_file_type="GTF",
                      min_read_fraction_overlap=1.0, feature_type_to_use=None, attribute_type_to_use=None,
                      sample_name=None, stranded=1):
    """Count miRNA reads and adjust per-feature counts for multimapping.

    Runs self.count() twice (without and with multimapped reads) and
    computes, for every feature,

        adjusted = ceil(no_mm + (with_mm - no_mm) / n_sharing_features)

    where n_sharing_features is taken from the ';'-separated feature list
    in column 1 of the counts file. Writes the three count files under
    output_prefix and returns the adjusted-count SynDict.
    """
    no_multimapped_read_counts = "%s.no_multimapped_reads.count" % output_prefix
    with_multimapped_read_counts = "%s.with_multimapped_reads.count" % output_prefix
    all_adjusted_read_counts = "%s.all_adjusted_reads.count" % output_prefix

    self.count(alignment_file, gff_file, no_multimapped_read_counts,
               annotation_file_type=annotation_file_type,
               min_read_fraction_overlap=min_read_fraction_overlap,
               feature_type_to_use=feature_type_to_use,
               attribute_type_to_use=attribute_type_to_use,
               stranded=stranded)
    self.count(alignment_file, gff_file, with_multimapped_read_counts,
               count_multimapped_reads=True,
               annotation_file_type=annotation_file_type,
               min_read_fraction_overlap=min_read_fraction_overlap,
               feature_type_to_use=feature_type_to_use,
               attribute_type_to_use=attribute_type_to_use,
               stranded=stranded)

    no_multimapped_read_count_dict = SynDict(filename=no_multimapped_read_counts,
                                             comments_prefix="#",
                                             key_index=0, value_index=6,
                                             expression=int, header=True)
    with_multimapped_read_count_dict = SynDict(filename=with_multimapped_read_counts,
                                               comments_prefix="#",
                                               key_index=0, value_index=6,
                                               expression=int, header=True)
    # Column 1 lists all features sharing the reads, separated by ';'
    similar_feature_number_dict = SynDict(filename=with_multimapped_read_counts,
                                          comments_prefix="#", header=True,
                                          key_index=0, value_index=1,
                                          expression=lambda s: len(s.split(";")))

    # Fall back to the sample column name from the counts-file header
    final_sample_name = sample_name if sample_name else similar_feature_number_dict.header.split()[6]

    all_adjusted_read_count_dict = SynDict()
    all_adjusted_read_count_dict.header = ".\t%s" % final_sample_name

    for feature_id in no_multimapped_read_count_dict:
        unique_reads = float(no_multimapped_read_count_dict[feature_id])
        multimapped_reads = float(with_multimapped_read_count_dict[feature_id]) - unique_reads
        # split multimapped reads evenly across the features sharing them
        adjusted = unique_reads + multimapped_reads / float(similar_feature_number_dict[feature_id])
        all_adjusted_read_count_dict[feature_id] = int(ceil(adjusted))

    all_adjusted_read_count_dict.write(all_adjusted_read_counts, header=True)
    # return the adjusted counts so callers do not have to re-read the file
    return all_adjusted_read_count_dict
def draw_length_histogram(sequence_dict, output_prefix, number_of_bins=None, width_of_bins=None,
                          min_length=1, max_length=None, extensions=("png", "svg"),
                          legend_location='best'):
    """Draw a histogram of sequence lengths.

    Writes per-record lengths to <output_prefix>.len, plots a histogram of
    lengths restricted to [min_length, max_length] (max defaults to the
    longest sequence) and saves it as <output_prefix>.<ext> for every
    extension. number_of_bins takes precedence over width_of_bins; with
    neither set, 30 bins are used.
    """
    length_dict = SynDict()
    for record_id in sequence_dict:
        length_dict[record_id] = len(sequence_dict[record_id].seq)
    length_dict.write("%s.len" % output_prefix)

    # Materialize as a list: a dict view is not valid input for
    # np.median/np.mean on Python 3
    lengths = list(length_dict.values())
    max_len = max(lengths)
    min_len = min(lengths)
    median = np.median(lengths)
    mean = np.mean(lengths)

    maximum_length = max_len if max_length is None else max_length

    # Single pass replaces the original's three equivalent filter branches
    filtered = [entry for entry in lengths if min_length <= entry <= maximum_length]

    plt.figure(1, figsize=(6, 6))
    plt.subplot(1, 1, 1)

    if number_of_bins:
        bins = number_of_bins
    elif width_of_bins:
        bins = np.arange(min_length - 1, maximum_length, width_of_bins, dtype=np.int32)
        bins[0] += 1
        bins = np.append(bins, [maximum_length])
    else:
        bins = 30

    plt.hist(filtered, bins=bins)
    plt.xlim(xmin=min_length, xmax=maximum_length)
    plt.xlabel("Length")
    plt.ylabel("N")
    plt.title("Distribution of sequence lengths")
    plt.legend(("Min: %i\nMax: %i\nMean: %i\nMedian: %i" % (min_len, max_len, mean, median),),
               loc=legend_location)
    for ext in extensions:
        plt.savefig("%s.%s" % (output_prefix, ext))
    # best-effort cleanup of the sequence index file some readers create;
    # guarded so a missing file no longer raises OSError
    if os.path.exists("temp.idx"):
        os.remove("temp.idx")
os.system(exe_string) os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids)) syn_dict = SynDict() syn_dict.read(pep_uniq_description_no_isoform_versions, header=False, separator="\t", allow_repeats_of_key=True, split_values=True, values_separator=",", key_index=1, value_index=0, comments_prefix="#") syn_dict.write(pep_description_collapsed_isoforms, splited_values=True, values_separator=",") length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input, format="fasta", out_file=len_file) descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w") descr_longest_isoform_fd = open(pep_description_longest_isoform, "w") descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w") for gene in syn_dict: len_list = [] longest_isoform = None max_len = 0 for isoform_id in syn_dict[gene]: