def correct_regions_from_gff(self, reference, variants_vcf, gff_file,
                             output_prefix=None,
                             feature_type_list=["CDS"],
                             unification_key="Parent",
                             vcf_with_masking=None,
                             override_vcf_by_mask=None,
                             use_ambiguous_nuccleotides=None):
    feature_dict = AnnotationsRoutines.get_feature_dict(gff_file,
                                                        output_prefix=output_prefix,
                                                        feature_type_list=feature_type_list,
                                                        unification_key=unification_key)
    region_file = "%s.coordinates_only.list" % output_prefix
    raw_regions = "%s.raw.seq" % output_prefix
    final_regions = "%s.fasta" % output_prefix
    regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

    self.correct_reference(reference, raw_regions, variants_vcf,
                           raw_seq_per_line=True,
                           vcf_with_masking=vcf_with_masking,
                           override_vcf_by_mask=override_vcf_by_mask,
                           use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
                           interval_list=region_file)

    region_with_frameshift = SynDict()

    def new_regions_generator():
        with open(raw_regions, "r") as in_fd:
            for region_id in feature_dict:
                seq = ""
                for i in range(0, len(feature_dict[region_id])):
                    seq_fragment = in_fd.readline().strip()
                    # a length difference between the annotated fragment and the
                    # corrected sequence that is not a multiple of 3 indicates a
                    # frameshift introduced by the variants
                    if ((int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1) - len(seq_fragment)) % 3 != 0:
                        if region_id not in region_with_frameshift:
                            region_with_frameshift[region_id] = [i]
                        else:
                            region_with_frameshift[region_id].append(i)
                    seq += seq_fragment
                yield SeqRecord(seq=Seq(seq) if feature_dict[region_id][0][3] == "+" else Seq(seq).reverse_complement(),
                                id=region_id,
                                description="")

    SeqIO.write(new_regions_generator(), final_regions, format="fasta")
    region_with_frameshift.write(regions_with_frameshift_file, splited_values=True)
def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
    """
    Tested on gtf files from Ensembl release 70
    """
    accordance_dict = SynDict()
    with open(gtf_file, "r") as gtf_fd:
        for line in gtf_fd:
            if line[0] == comment_symbol:
                continue
            tmp_list = line.strip().split("\t")
            tmp_list = tmp_list[-1].split(";")
            protein_id = None
            transcript_id = None
            for entry in tmp_list:
                tmp_entry = entry.split()
                if len(tmp_entry) != 2:
                    continue
                if tmp_entry[0] == "transcript_id":
                    transcript_id = tmp_entry[1][1:-1]  # remove quotes
                elif tmp_entry[0] == "protein_id":
                    protein_id = tmp_entry[1][1:-1]
            if (transcript_id is not None) and (protein_id is not None):
                if transcript_id in accordance_dict:
                    accordance_dict[transcript_id].add(protein_id)
                else:
                    accordance_dict[transcript_id] = {protein_id}
    accordance_dict.write(output_file, splited_values=True)
def count_per_scaffold_feature_number(gff_file, out_file=None, feature_type_list=[]):
    feature_count_dict = SynDict()

    if feature_type_list:
        def check_feature_type(feature_type):
            return feature_type in feature_type_list
    else:
        def check_feature_type(feature_type):
            return True

    with open(gff_file, "r") as gff_fd:
        for line in gff_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            if check_feature_type(line_list[2]):
                if line_list[0] in feature_count_dict:
                    feature_count_dict[line_list[0]] += 1
                else:
                    feature_count_dict[line_list[0]] = 1

    if out_file:
        feature_count_dict.write(out_file)

    return feature_count_dict
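# Usage sketch (file names are hypothetical; assumes a GFF3 annotation on disk):
#
#     gene_counts = count_per_scaffold_feature_number("annotation.gff",
#                                                     out_file="gene_counts.tab",
#                                                     feature_type_list=["gene"])
#     # gene_counts maps scaffold id -> number of "gene" features on that scaffold;
#     # with an empty feature_type_list every feature line is counted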
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                elements_with_absent_synonyms_file=None,
                                syn_file_key_column_index=0,
                                syn_file_value_column_index=1,
                                syn_file_column_separator='\t',
                                keep_only_unique_elements=False):
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#",
                  key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index,
                  separator=syn_file_column_separator)

    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",",
                       comments_prefix="#")

    output_clusters_dict = SynDict()
    absent_elements_dict = SynDict()

    for cluster in clusters_dict:
        renamed_element_list = []
        all_elements_were_renamed_flag = True
        for element in clusters_dict[cluster]:
            if element in syn_dict:
                renamed_element_list.append(syn_dict[element])
            else:
                if cluster not in absent_elements_dict:
                    absent_elements_dict[cluster] = [element]
                else:
                    absent_elements_dict[cluster].append(element)
                all_elements_were_renamed_flag = False
                renamed_element_list.append(element)

        if (not remove_clusters_with_not_renamed_elements) or all_elements_were_renamed_flag:
            output_clusters_dict[cluster] = set(renamed_element_list) if keep_only_unique_elements else renamed_element_list

    output_clusters_dict.write(output_clusters_file, splited_values=True)

    if elements_with_absent_synonyms_file:
        absent_elements_dict.write(elements_with_absent_synonyms_file, splited_values=True)

    return absent_elements_dict
def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
    len_dict = SynDict()
    with open(trf_gff, "r") as trf_fd:
        for line in trf_fd:
            if line[0] == "#":
                continue
            description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(line)
            len_dict[description_dict["ID"]] = description_dict["Period"]
    len_dict.write(len_file)
def get_families_from_top_hits(top_hits_file, fam_file):
    hit_dict = SynDict()
    hit_dict.read(top_hits_file, allow_repeats_of_key=True,
                  key_index=1, value_index=0, comments_prefix="#")
    hit_dict.write(fam_file, splited_values=True)
    return hit_dict
def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
    hits_dict = SynDict()
    hits_dict.read(domtblout_file, header=False, separator=None,
                   allow_repeats_of_key=True,
                   key_index=3, value_index=0, comments_prefix="#")
    if output_file:
        hits_dict.write(output_file, splited_values=True)
    return hits_dict
def syn2fam(syn_file, fam_file, key_column=0, value_column=1, separator="\t"):
    syn_dict = SynDict(filename=syn_file, allow_repeats_of_key=True,
                       key_index=key_column, value_index=value_column,
                       separator=separator, split_values=True)
    syn_dict.write(fam_file, splited_values=True)
def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file, output_file):
    GO_terms_dict = SynDict(filename=emapper_annotation_file,
                            key_index=0, value_index=5,
                            split_values=True, values_separator=",",
                            comments_prefix="#", separator="\t")
    GO_terms_dict.header = "#protein_id\tGO_terms"
    GO_terms_dict.write(output_file, header=True, splited_values=True)
    return GO_terms_dict
def extract_predicted_gene_names_from_emapper_annotation_file(emapper_annotation_file, output_file):
    predicted_gene_names_dict = SynDict(filename=emapper_annotation_file,
                                        key_index=0, value_index=4,
                                        split_values=True, values_separator=",",
                                        comments_prefix="#", separator="\t")
    predicted_gene_names_dict.header = "#protein_id\tpredicted_gene_name"
    predicted_gene_names_dict.write(output_file, header=True, splited_values=True)
    return predicted_gene_names_dict
def convert_emapper_annotation_file_to_fam(emapper_annotation_file, output_fam,
                                           eggnogdb_prefix=None,
                                           species_name=None,
                                           label_separator="@",
                                           diamond_mode=False,
                                           database=None):
    fam_dict = SynDict()

    if diamond_mode and (database is not None):
        def extract_fam_from_line(line_list):
            # column 9 holds comma-separated "family@database" pairs
            db_dict = dict(map(lambda s: s.split("@")[::-1], line_list[9].split(",")))
            return db_dict[database] if database in db_dict else "unknown"
    elif diamond_mode:
        raise ValueError("ERROR!!! Database name (veNOG or other) is required in diamond mode!")
    else:
        def extract_fam_from_line(line_list):
            return line_list[10].split("|")[0]

    with open(emapper_annotation_file, "r") as annotations_fd:
        for line in annotations_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            fam_id = extract_fam_from_line(line_list)
            if eggnogdb_prefix is not None:
                fam_id = eggnogdb_prefix + fam_id
            gene_id = "%s%s%s" % (species_name, label_separator, line_list[0]) if species_name else line_list[0]
            if fam_id in fam_dict:
                fam_dict[fam_id].append(gene_id)
            else:
                fam_dict[fam_id] = [gene_id]

    fam_dict.write(filename=output_fam, splited_values=True)
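# Usage sketch (hypothetical file names; "veNOG" is an example eggNOG database tag).
# Collapse an eggNOG-mapper annotation table into a family -> genes mapping:
#
#     convert_emapper_annotation_file_to_fam("proteins.emapper.annotations",
#                                            "proteins.fam",
#                                            eggnogdb_prefix="veNOG.",
#                                            species_name="homo_sapiens")
#     # gene ids in the output are labeled "homo_sapiens@<protein_id>"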
def get_feature_dict(self, input_gff, output_prefix=None, feature_type_list=["CDS"],
                     unification_key="Parent"):
    feature_dict = SynDict()
    for line_list in self.file_line_as_list_generator(input_gff,
                                                      comments_prefix="#",
                                                      separator="\t"):
        annotation_dict = self.parse_gff_annotation_string_to_dict(line_list[self.GFF_ATTRIBUTE_COLUMN])
        if line_list[self.GFF_FEATURETYPE_COLUMN] not in feature_type_list:
            continue
        if unification_key not in annotation_dict:
            continue
        if annotation_dict[unification_key][0] not in feature_dict:
            feature_dict[annotation_dict[unification_key][0]] = []
        feature_dict[annotation_dict[unification_key][0]].append([line_list[self.GFF_SCAFFOLD_COLUMN],
                                                                  line_list[self.GFF_START_COLUMN],
                                                                  line_list[self.GFF_END_COLUMN],
                                                                  line_list[self.GFF_STRAND_COLUMN]])
    if output_prefix:
        feature_dict.write("%s.tab" % output_prefix,
                           value_expression=self.feature_list_entry_to_tab_str,
                           line_per_value=True)
        feature_dict.write("%s.coordinates_only.tab" % output_prefix,
                           value_expression=self.feature_list_entry_to_tab_str,
                           line_per_value=True, values_only=True)
        feature_dict.write("%s.list" % output_prefix,
                           value_expression=self.feature_list_entry_to_gatk_interval_str,
                           line_per_value=True)
        feature_dict.write("%s.coordinates_only.list" % output_prefix,
                           value_expression=self.feature_list_entry_to_gatk_interval_str,
                           line_per_value=True, values_only=True)
    return feature_dict
def count_miRNA_reads(self, alignment_file, gff_file, output_prefix,
                      annotation_file_type="GTF",
                      min_read_fraction_overlap=1.0,
                      feature_type_to_use=None,
                      attribute_type_to_use=None,
                      sample_name=None,
                      stranded=1):
    no_multimapped_read_counts = "%s.no_multimapped_reads.count" % output_prefix
    with_multimapped_read_counts = "%s.with_multimapped_reads.count" % output_prefix
    all_adjusted_read_counts = "%s.all_adjusted_reads.count" % output_prefix

    self.count(alignment_file, gff_file, no_multimapped_read_counts,
               annotation_file_type=annotation_file_type,
               min_read_fraction_overlap=min_read_fraction_overlap,
               feature_type_to_use=feature_type_to_use,
               attribute_type_to_use=attribute_type_to_use,
               stranded=stranded)
    self.count(alignment_file, gff_file, with_multimapped_read_counts,
               count_multimapped_reads=True,
               annotation_file_type=annotation_file_type,
               min_read_fraction_overlap=min_read_fraction_overlap,
               feature_type_to_use=feature_type_to_use,
               attribute_type_to_use=attribute_type_to_use,
               stranded=stranded)

    no_multimapped_read_count_dict = SynDict(filename=no_multimapped_read_counts,
                                             comments_prefix="#",
                                             key_index=0, value_index=6,
                                             expression=int, header=True)
    with_multimapped_read_count_dict = SynDict(filename=with_multimapped_read_counts,
                                               comments_prefix="#",
                                               key_index=0, value_index=6,
                                               expression=int, header=True)
    similar_feature_number_dict = SynDict(filename=with_multimapped_read_counts,
                                          comments_prefix="#", header=True,
                                          key_index=0, value_index=1,
                                          expression=lambda s: len(s.split(";")))

    final_sample_name = sample_name if sample_name else similar_feature_number_dict.header.split()[6]

    all_adjusted_read_count_dict = SynDict()
    all_adjusted_read_count_dict.header = ".\t%s" % final_sample_name

    # distribute multimapped reads evenly between the features they map to
    for feature_id in no_multimapped_read_count_dict:
        all_adjusted_read_count_dict[feature_id] = int(ceil(float(no_multimapped_read_count_dict[feature_id]) +
                                                            (float(with_multimapped_read_count_dict[feature_id]) -
                                                             float(no_multimapped_read_count_dict[feature_id])) /
                                                            float(similar_feature_number_dict[feature_id])))

    all_adjusted_read_count_dict.write(all_adjusted_read_counts, header=True)
def count_column_values_from_file(self, input_file, column_number, output_file=None,
                                  separator="\t", comments_prefix="#", verbose=False):
    column_value_dict = SynDict()

    for line_list in self.file_line_as_list_generator(input_file,
                                                      separator=separator,
                                                      comments_prefix=comments_prefix):
        if line_list[column_number] in column_value_dict:
            column_value_dict[line_list[column_number]] += 1
        else:
            column_value_dict[line_list[column_number]] = 1

    if output_file:
        column_value_dict.write(output_file)

    return column_value_dict
def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index,
                                          id_separator="_", output_prefix=None):
    cluster_dict = SynDict()
    skipped_id_list = IdList()
    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # cluster by the chosen fragment of the split id
            if seq_id_splited[id_element_index] in cluster_dict:
                cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
            else:
                cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

    return cluster_dict
def extract_clusters_by_size(cluster_dict, min_cluster_size=None, max_cluster_size=None,
                             white_list_ids=None, out_file=None):
    filtered_cluster_dict = SynDict()

    if (min_cluster_size is not None) and (max_cluster_size is not None):
        def filt_exp(element_list):
            return min_cluster_size <= len(element_list) <= max_cluster_size
    elif max_cluster_size is not None:
        def filt_exp(element_list):
            return len(element_list) <= max_cluster_size
    elif min_cluster_size is not None:
        def filt_exp(element_list):
            return min_cluster_size <= len(element_list)
    else:
        raise ValueError("ERROR!!! Neither minimum nor maximum cluster size thresholds were set")

    for cluster_id in cluster_dict:
        if white_list_ids and (cluster_id not in white_list_ids):
            continue
        if filt_exp(cluster_dict[cluster_id]):
            filtered_cluster_dict[cluster_id] = cluster_dict[cluster_id]

    if out_file:
        filtered_cluster_dict.write(filename=out_file, splited_values=True)

    return filtered_cluster_dict
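# Usage sketch (assumes cluster_dict was produced by one of the readers above):
#
#     big_clusters = extract_clusters_by_size(cluster_dict,
#                                             min_cluster_size=2,
#                                             out_file="clusters.min2.fam")
#     # keeps only clusters with at least two members; calling with neither
#     # threshold set raises ValueError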
def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict,
                                           output_prefix=None, species_id=None):
    extracted_families = SynDict()
    common_protein_names_to_families_dict = SynDict()
    common_names_to_eggnog_proteins_syn_dict = SynDict()
    not_found_proteins_common_names = IdList()

    transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

    for common_protein_name in protein_syn_dict:
        not_found = True
        for protein_id in protein_syn_dict[common_protein_name]:
            extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
            if extended_protein_id in transposed_eggnog_fam_dict:
                not_found = False
                if common_protein_name not in common_protein_names_to_families_dict:
                    common_protein_names_to_families_dict[common_protein_name] = [transposed_eggnog_fam_dict[extended_protein_id][0]]
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                else:
                    common_protein_names_to_families_dict[common_protein_name].append(transposed_eggnog_fam_dict[extended_protein_id][0])
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                    extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]
        if not_found:
            not_found_proteins_common_names.append(common_protein_name)

    if output_prefix:
        extracted_families.write(filename="%s.extracted_families.fam" % output_prefix,
                                 splited_values=True)
        common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix,
                                                    splited_values=True)
        common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix,
                                                       splited_values=True)
        not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

    return extracted_families, common_protein_names_to_families_dict, \
           common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
accordance_dict[species].read(accordance_file, key_index=1, value_index=0)

if args.name_first:
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[0], args.name_separator.join(gene_list[1:])
else:
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[-1], args.name_separator.join(gene_list[:-1])

families_with_errors = IdSet()

for family in pep_fam_dict:
    cds_fam_dict[family] = []
    for pep in pep_fam_dict[family]:
        species, pep_name = split_name(pep)
        if pep_name in accordance_dict[species]:
            cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \
                       "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species)
            cds_fam_dict[family].append(cds_name)
        else:
            print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name))
            families_with_errors.add(family)

for family in families_with_errors:
    cds_fam_dict.pop(family, None)

families_with_errors.write(args.fam_error)
cds_fam_dict.write(args.output, splited_values=True)
def add_flanks_to_gff_record(self, input_gff, output_prefix, left_flank_len,
                             right_flank_len, fasta_file,
                             coords_description_entry="core_seq_coords",
                             id_description_entry="ID"):
    sequence_length_dict = self.get_lengths_from_seq_file(fasta_file)
    shorter_flanks_dict = SynDict()
    output_gff = "%s.gff" % output_prefix
    short_flanks_file = "%s.short_flanks.dat" % output_prefix
    with open(input_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                line_list = line.strip().split("\t")
                scaffold = line_list[0]
                start = int(line_list[3])
                end = int(line_list[4])
                record_id = OrderedDict(map(lambda s: s.split("="), line_list[8].split(";")))[id_description_entry]
                line_list[8] += ";%s=%i,%i" % (coords_description_entry, start, end)
                if line_list[6] == "-":
                    # on the minus strand left and right flanks are swapped
                    # in genomic coordinates
                    if start - right_flank_len > 0:
                        line_list[3] = str(start - right_flank_len)
                        right_flank_length = right_flank_len
                    else:
                        right_flank_length = start - 1
                        line_list[3] = "1"
                    if end + left_flank_len <= sequence_length_dict[line_list[0]]:
                        line_list[4] = str(end + left_flank_len)
                        left_flank_length = left_flank_len
                    else:
                        left_flank_length = sequence_length_dict[line_list[0]] - end
                        line_list[4] = str(sequence_length_dict[line_list[0]])
                else:
                    if start - left_flank_len > 0:
                        line_list[3] = str(start - left_flank_len)
                        left_flank_length = left_flank_len
                    else:
                        left_flank_length = start - 1
                        line_list[3] = "1"
                    if end + right_flank_len <= sequence_length_dict[line_list[0]]:
                        line_list[4] = str(end + right_flank_len)
                        right_flank_length = right_flank_len
                    else:
                        right_flank_length = sequence_length_dict[line_list[0]] - end
                        line_list[4] = str(sequence_length_dict[line_list[0]])
                if (left_flank_length < left_flank_len) or (right_flank_length < right_flank_len):
                    print("%s: Short flank" % record_id)
                    shorter_flanks_dict[record_id] = "%i,%i" % (left_flank_length, right_flank_length)
                line_list[8] += ";%s_relative=%i,%i\n" % (coords_description_entry,
                                                          1 + (right_flank_length if line_list[6] == "-" else left_flank_length),
                                                          end - start + 1 + (right_flank_length if line_list[6] == "-" else left_flank_length))
                out_fd.write("\t".join(line_list))
    shorter_flanks_dict.write(short_flanks_file)
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set

complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            # strip the two-character prefix, as in the loop above
            tmp[i] = tmp[i][2:]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)

complicated_families_syn_ids.write("complicated_families_check.ids")
nonassembled.write("splited_to_several_families.t", absent_symbol=".")
assemled_to_different_families = species_syn_dict.filter_by_line(filter_different_assembly)
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--tree_dir", action="store", dest="tree_dir", required=True,
                    type=check_path,
                    help="Directory with trees")
parser.add_argument("-f", "--tree_format", action="store", dest="tree_format",
                    default=1, type=int,
                    help="Format of input trees")
parser.add_argument("-o", "--output_file", action="store", dest="output_file",
                    default="stdout",
                    help="Output file with leaves of trees. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output_file == "stdout" else open(args.output_file, "w")

tree_files_list = os.listdir(args.tree_dir)

names_dict = SynDict()
for tree_file in tree_files_list:
    tree_name = split_filename(tree_file)[1]
    with open("%s%s" % (args.tree_dir, tree_file), "r") as tree_fd:
        tree = Tree(tree_fd.readline().strip(), format=args.tree_format)
    leaves_list = []
    for node in tree.traverse():
        if node.is_leaf():
            leaves_list.append(node.name)
    names_dict[tree_name] = leaves_list

names_dict.write(out_fd, splited_values=True)

if args.output_file != "stdout":
    out_fd.close()
def replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None,
                         number_of_digits_in_id=8):
    output_gff = "%s.renamed.gff" % output_prefix
    genes_syn_file = "%s.gene.syn" % output_prefix
    transcripts_syn_file = "%s.transcript.syn" % output_prefix
    cds_syn_file = "%s.cds.syn" % output_prefix
    genes_syn_dict = SynDict()
    transcripts_syn_dict = SynDict()
    cds_syn_dict = SynDict()
    gene_counter = 0
    gene_id_template = "%sG%%0%ii" % (species_prefix, number_of_digits_in_id)
    transcripts_counter = 0
    transcript_id_template = "%sT%%0%ii" % (species_prefix, number_of_digits_in_id)
    cds_counter = 0
    cds_id_template = "%sC%%0%ii" % (species_prefix, number_of_digits_in_id)
    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue
                augustus_gene_id = tmp.split(" ")[-1]
                gene_counter += 1
                gene_syn_id = gene_id_template % gene_counter
                genes_syn_dict[augustus_gene_id] = gene_syn_id
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                tmp = in_fd.readline().strip()
                while True:
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    if augustus_transcript_id not in transcripts_syn_dict:
                                        transcripts_counter += 1
                                        transcripts_syn_dict[augustus_transcript_id] = transcript_id_template % transcripts_counter
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError("Transcript parent id and gene id are not the same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    if augustus_cds_id not in cds_syn_dict:
                                        cds_counter += 1
                                        cds_syn_dict[augustus_cds_id] = cds_id_template % cds_counter
                                    cds_syn_id = cds_syn_dict[augustus_cds_id]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError("CDS parent id and transcript id are not the same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError("Feature parent id and transcript id are not the same!")
                            edited_str += "\tParent=%s\n" % transcript_syn_id
                        else:
                            edited_str = tmp + "\n"
                        out_fd.write(edited_str)
                        tmp = in_fd.readline().strip()
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = in_fd.readline().strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)
    genes_syn_dict.write(genes_syn_file)
    transcripts_syn_dict.write(transcripts_syn_file)
    cds_syn_dict.write(cds_syn_file)
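# A note on the id templates above: the doubled "%%" survives the first
# formatting pass, yielding a template that the counters fill in later.
# For example (hypothetical prefix):
#
#     gene_id_template = "%sG%%0%ii" % ("NIG", 8)   # -> "NIGG%08i"
#     gene_id_template % 1                          # -> "NIGG00000001"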
action="store", dest="output", default="stdout", help="Output file") parser.add_argument("-k", "--family_column", action="store", dest="fam_col", default=1, type=int, help="Family column position(0-based). Default: 1") parser.add_argument("-a", "--genes_column", action="store", dest="gen_col", default=0, type=int, help="Genes column position(0-based). Default: 0") args = parser.parse_args() hit_dict = SynDict() hit_dict.read(args.input, header=args.header, allow_repeats_of_key=True, key_index=args.fam_col, value_index=args.gen_col) hit_dict.write(args.output, splited_values=True)
os.system(exe_string)
os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions, header=False, separator="\t",
              allow_repeats_of_key=True, split_values=True, values_separator=",",
              key_index=1, value_index=0, comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms, splited_values=True, values_separator=",")

length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input, format="fasta",
                                                         out_file=len_file)

descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")

for gene in syn_dict:
    len_list = []
    longest_isoform = None
    max_len = 0
    for isoform_id in syn_dict[gene]:
def extract_hits_from_tbl_output(blast_hits, output_file):
    hits = SynDict()
    hits.read(blast_hits, allow_repeats_of_key=True,
              key_index=0, value_index=1, separator="\t")
    hits.write(output_file, splited_values=True, separator="\t", values_separator=",")
    return hits
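# Usage sketch: collapse tabular BLAST hits (outfmt 6, query id in column 0,
# subject id in column 1; file names are hypothetical):
#
#     hits = extract_hits_from_tbl_output("blast.outfmt6.tab", "query_to_hits.tab")
#     # each output line: <query_id>\t<hit1>,<hit2>,...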
out_fd = sys.stdout if args.output_prefix == "stdout" else open("%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam, separator="\t", split_values=True,
                        values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input, header=True, column_number=0, column_separator="\t")

reference_random_genes = SynDict()

for family_id in node_family_ids:
    if family_id not in reference_families:
        reference_random_genes[family_id] = "."
    else:
        reference_random_genes[family_id] = choice(reference_families[family_id])

reference_random_genes.write("%s_reference_random_genes.t" % args.output_prefix)

for family_id in reference_random_genes:
    if reference_random_genes[family_id] != ".":
        out_fd.write("%s\n" % reference_random_genes[family_id])
"--remove_value_repeats", action="store_true", dest="remove_value_repeats", help="Remove repeats of values") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") syn_dict = SynDict() syn_dict.read(args.input, header=False, separator=args.column_separator, allow_repeats_of_key=True, split_values=True, values_separator=args.value_separator, key_index=args.key_column, value_index=args.value_column, comments_prefix=args.comments_prefix) if args.remove_value_repeats: collapsed_dict = syn_dict.remove_value_repeats() collapsed_dict.write(out_fd, splited_values=True, values_separator=args.value_separator, close_after_if_file_object=True) else: syn_dict.write(out_fd, splited_values=True, values_separator=args.value_separator, close_after_if_file_object=True) #out_fd.close()