def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
    """
    Tested on gtf files from Ensembl release 70
    """
    accordance_dict = SynDict()
    with open(gtf_file, "r") as gtf_fd:
        for line in gtf_fd:
            if line[0] == comment_symbol:
                continue
            tmp_list = line.strip().split("\t")
            tmp_list = tmp_list[-1].split(";")
            protein_id = None
            transcript_id = None
            for entry in tmp_list:
                tmp_entry = entry.split()
                if len(tmp_entry) != 2:
                    continue
                if tmp_entry[0] == "transcript_id":
                    transcript_id = tmp_entry[1][1:-1]  # remove quotes
                elif tmp_entry[0] == "protein_id":
                    protein_id = tmp_entry[1][1:-1]  # remove quotes
            if (transcript_id is not None) and (protein_id is not None):
                if transcript_id in accordance_dict:
                    accordance_dict[transcript_id].add(protein_id)
                else:
                    accordance_dict[transcript_id] = {protein_id}
    accordance_dict.write(output_file, splited_values=True)

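# Usage sketch (hypothetical file names; assumes SynDict from this package, behaving
# as a dict with a write() method):
#
#     get_transcript_to_pep_accordance_from_gtf("annotation.gtf", "transcript2pep.tab")
#
# The output is a two-column table mapping each transcript id to the comma-separated
# set of protein ids observed for it.
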
def correct_regions_from_gff(self, reference, variants_vcf, gff_file, output_prefix=None,
                             feature_type_list=("CDS",), unification_key="Parent",
                             vcf_with_masking=None, override_vcf_by_mask=None,
                             use_ambiguous_nuccleotides=None):
    feature_dict = AnnotationsRoutines.get_feature_dict(gff_file, output_prefix=output_prefix,
                                                        feature_type_list=feature_type_list,
                                                        unification_key=unification_key)
    region_file = "%s.coordinates_only.list" % output_prefix
    raw_regions = "%s.raw.seq" % output_prefix
    final_regions = "%s.fasta" % output_prefix
    regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

    self.correct_reference(reference, raw_regions, variants_vcf,
                           raw_seq_per_line=True,
                           vcf_with_masking=vcf_with_masking,
                           override_vcf_by_mask=override_vcf_by_mask,
                           use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
                           interval_list=region_file)

    region_with_frameshift = SynDict()

    def new_regions_generator():
        with open(raw_regions, "r") as in_fd:
            for region_id in feature_dict:
                seq = ""
                for i in range(0, len(feature_dict[region_id])):
                    seq_fragment = in_fd.readline().strip()
                    # a length change that is not a multiple of 3 indicates a frameshift
                    if ((int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1)
                            - len(seq_fragment)) % 3 != 0:
                        if region_id not in region_with_frameshift:
                            region_with_frameshift[region_id] = [i]
                        else:
                            region_with_frameshift[region_id].append(i)
                    seq += seq_fragment
                yield SeqRecord(seq=Seq(seq) if feature_dict[region_id][0][3] == "+" else Seq(seq).reverse_complement(),
                                id=region_id,
                                description="")

    SeqIO.write(new_regions_generator(), final_regions, format="fasta")
    region_with_frameshift.write(regions_with_frameshift_file, splited_values=True)

def convert_emapper_annotation_file_to_fam(emapper_annotation_file, output_fam, eggnogdb_prefix=None,
                                           species_name=None, label_separator="."):
    fam_dict = SynDict()
    with open(emapper_annotation_file, "r") as annotations_fd:
        for line in annotations_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            fam_id = line_list[10].split("|")[0]
            if eggnogdb_prefix is not None:
                fam_id = eggnogdb_prefix + fam_id
            gene_id = "%s%s%s" % (species_name, label_separator, line_list[0]) if species_name else line_list[0]
            if fam_id in fam_dict:
                fam_dict[fam_id].append(gene_id)
            else:
                fam_dict[fam_id] = [gene_id]
    fam_dict.write(filename=output_fam, splited_values=True)

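# Usage sketch (hypothetical paths; assumes an eggNOG-mapper annotation layout where
# column 0 is the query id and column 10 holds the OG assignment, as the indexing
# above expects):
#
#     convert_emapper_annotation_file_to_fam("proteins.emapper.annotations",
#                                            "families.fam",
#                                            eggnogdb_prefix="ENOG41",
#                                            species_name="felis_catus")
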
def count_per_scaffold_feature_number(gff_file, out_file=None, feature_type_list=()):
    feature_count_dict = SynDict()

    if feature_type_list:
        def check_feature_type(feature_type):
            return feature_type in feature_type_list
    else:
        def check_feature_type(feature_type):
            return True

    with open(gff_file, "r") as gff_fd:
        for line in gff_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            if check_feature_type(line_list[2]):
                if line_list[0] in feature_count_dict:
                    feature_count_dict[line_list[0]] += 1
                else:
                    feature_count_dict[line_list[0]] = 1

    if out_file:
        feature_count_dict.write(out_file)

    return feature_count_dict

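# Usage sketch (hypothetical path): count only gene features per scaffold; with an
# empty feature_type_list every feature line is counted:
#
#     counts = count_per_scaffold_feature_number("annotation.gff",
#                                                out_file="per_scaffold.counts",
#                                                feature_type_list=["gene"])
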
def prepare_data_for_target_alignment(self, query_fasta, target_fasta, correspondence_file, out_dir,
                                      correspondence_query_column=0, correspondence_target_column=1):
    query_dict = self.parse_seq_file(query_fasta, "parse")
    target_dict = self.parse_seq_file(target_fasta, "parse")
    self.safe_mkdir(out_dir)

    correspondence_dict = SynDict(filename=correspondence_file, allow_repeats_of_key=True,
                                  key_index=correspondence_query_column,
                                  value_index=correspondence_target_column)

    for query_id in correspondence_dict:
        query_outfile = "%s/%s.query.fasta" % (out_dir, query_id)
        target_outfile = "%s/%s.target.fasta" % (out_dir, query_id)

        SeqIO.write(self.record_by_id_generator(query_dict, [query_id]), query_outfile, format="fasta")
        SeqIO.write(self.record_by_id_generator(target_dict, correspondence_dict[query_id]),
                    target_outfile, format="fasta")

    queries_with_targets_set = set(correspondence_dict.keys())
    queries_set = set(query_dict.keys())

    return queries_with_targets_set, queries_set - queries_with_targets_set

def get_codon_alignment_from_files(self, protein_aln_file, nucleotide_seq_file, codon_alignment_file,
                                   cds2protein_accordance_file=None,
                                   alignment_format="fasta", nucleotide_sequence_format="fasta",
                                   cds_index_file=None, retain_cds_index=False):
    protein_aln_dict = AlignIO.read(protein_aln_file, format=alignment_format)
    nucleotide_seq_dict = SeqIO.index_db(cds_index_file if cds_index_file else "nuc_tmp.idx",
                                         nucleotide_seq_file,
                                         format=nucleotide_sequence_format)

    protein2cds_accordance_dict = None
    if cds2protein_accordance_file:
        protein2cds_accordance_dict = SynDict()
        protein2cds_accordance_dict.read(cds2protein_accordance_file, key_index=1, value_index=0)

    self.get_codon_alignment(protein_aln_dict, nucleotide_seq_dict, codon_alignment_file,
                             protein2cds_accordance_dict=protein2cds_accordance_dict)
    if (not cds_index_file) and (not retain_cds_index):
        os.remove("nuc_tmp.idx")

def add_length_to_accordance_file(accordance_file, length_file, output_prefix):
    accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
    length_dict = SynDict(filename=length_file, expression=int)
    longest_list = IdList()

    all_output_file = "%s.all.correspondence" % output_prefix
    longest_output_file = "%s.longest.correspondence" % output_prefix
    longest_id_file = "%s.longest.ids" % output_prefix

    with open(all_output_file, "w") as all_out_fd, open(longest_output_file, "w") as longest_out_fd:
        for gene in accordance_dict:
            current_transcript = None
            current_length = 0
            for transcript in accordance_dict[gene]:
                if length_dict[transcript] > current_length:
                    current_transcript = transcript
                    current_length = length_dict[transcript]
                all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))
            longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
            longest_list.append(current_transcript)
    longest_list.write(longest_id_file)

def count_column_values_from_file(self, input_file, column_number, output_file=None, separator="\t",
                                  comments_prefix="#", verbose=False):
    column_value_dict = SynDict()

    for line_list in self.file_line_as_list_generator(input_file, separator=separator,
                                                      comments_prefix=comments_prefix):
        if line_list[column_number] in column_value_dict:
            column_value_dict[line_list[column_number]] += 1
        else:
            column_value_dict[line_list[column_number]] = 1

    if output_file:
        column_value_dict.write(output_file)
    if verbose:
        print("#Column %i (0-based) contains %i different values" % (column_number,
                                                                     len(column_value_dict)))

    return column_value_dict

def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file,
                                             output_dir="./", seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    from Routines import SequenceRoutines, FileRoutines

    cluster_id_list = IdList()
    cluster_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", FileRoutines.make_list_of_path_to_files(seq_file),
                                  format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id], protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(s): %s" % (fam_id,
                                                                            ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue

        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id],
                                                                verbose=True),
                        out_file, format=seq_format)

    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))
    return number_of_skipped_clusters

def find_leaves_with_positive_selection(self, write=True):
    # "W" holds the per-leaf omega (dN/dS) estimates; omega > 1 indicates positive selection
    leaf_values_dict = self.get_leaf_values(write=False)
    positive_selected_leaves_dict = SynDict()
    for leaf_name in leaf_values_dict["W"]:
        if leaf_values_dict["W"][leaf_name] > 1:
            positive_selected_leaves_dict[leaf_name] = leaf_values_dict["W"][leaf_name]
    if write:
        positive_selected_leaves_dict.write("leaves_with_positive_selection.t")
    return positive_selected_leaves_dict

def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
    len_dict = SynDict()
    with open(trf_gff, "r") as trf_fd:
        for line in trf_fd:
            if line[0] == "#":
                continue
            description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(line)
            len_dict[description_dict["ID"]] = description_dict["Period"]
    len_dict.write(len_file)

def add_len_to_simple_output(top_hits_simple, len_file, out_file):
    len_dict = SynDict()
    len_dict.read(len_file)
    with open(top_hits_simple, "r") as in_fd, open(out_file, "w") as out_fd:
        for line in in_fd:
            tmp_list = line.strip().split("\t")
            # last column: fraction of the sequence covered by the hit, (end - start + 1) / length
            out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n" % (tmp_list[0], len_dict[tmp_list[0]],
                                                       tmp_list[3], tmp_list[1], tmp_list[2],
                                                       (float(tmp_list[2]) - float(tmp_list[1]) + 1) / float(len_dict[tmp_list[0]])))

def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file, output_file):
    GO_terms_dict = SynDict(filename=emapper_annotation_file, key_index=0, value_index=5,
                            split_values=True, values_separator=",", comments_prefix="#",
                            separator="\t")
    GO_terms_dict.header = "#protein_id\tGO_terms"
    GO_terms_dict.write(output_file, header=True, splited_values=True)
    return GO_terms_dict

def count_unique_positions_per_sequence_from_file(self, alignment_file, output_prefix, format="fasta",
                                                  gap_symbol="-", return_mode="absolute", verbose=True):
    alignment = AlignIO.read(alignment_file, format=format)
    number_of_sequences = len(alignment)
    alignment_length = len(alignment[0])
    position_presence_matrix = self.get_position_presence_matrix(alignment, gap_symbol=gap_symbol,
                                                                 verbose=verbose)
    unique_position_count_dict = SynDict()
    unique_position_count_percent_dict = SynDict()

    for row in range(0, number_of_sequences):
        sequence_id = alignment[row].id
        unique_positions = 0
        for column in range(0, alignment_length):
            if (position_presence_matrix[row, column] == 1) or (position_presence_matrix[row, column] == -1):
                unique_positions += 1
        unique_position_count_dict[sequence_id] = unique_positions
        # percentage is relative to the ungapped length of the sequence
        unique_position_count_percent_dict[sequence_id] = 100 * float(unique_positions) / (alignment_length - str(alignment[row].seq).count(gap_symbol))

    unique_position_count_dict.write("%s.absolute_counts" % output_prefix)
    unique_position_count_percent_dict.write("%s.percent_counts" % output_prefix)

    return unique_position_count_dict if return_mode == "absolute" else unique_position_count_percent_dict

def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
    syn_dict = SynDict()
    syn_dict.read(synonyms_file, comments_prefix="#")
    with open(input_gff, "r") as in_fd, open(output_gff, "w") as out_fd:
        for line in in_fd:
            if line[0] == "#":
                out_fd.write(line)
            else:
                line_list = line.split("\t")
                if line_list[0] in syn_dict:
                    line_list[0] = syn_dict[line_list[0]]
                    out_fd.write("\t".join(line_list))
                else:
                    out_fd.write(line)

def merge_clusters(clusters_dict, label_species=False, separator_for_labeling="_",
                   species_label_first=True):
    if species_label_first:
        label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
    else:
        label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

    if label_species:
        expression = label_sequence
    else:
        expression = lambda label, name: name

    merged_clusters = SynDict()
    for species in clusters_dict:
        for cluster in clusters_dict[species]:
            if cluster not in merged_clusters:
                merged_clusters[cluster] = []
            for sequence_name in clusters_dict[species][cluster]:
                merged_clusters[cluster].append(expression(species, sequence_name))

    return merged_clusters

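# Usage sketch (made-up data; clusters_dict maps species -> cluster -> element list):
#
#     clusters = {"cat": {"fam1": ["geneA", "geneB"]},
#                 "dog": {"fam1": ["geneC"]}}
#     merge_clusters(clusters, label_species=True)
#     # -> {"fam1": ["cat_geneA", "cat_geneB", "dog_geneC"]}
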
def extract_single_copy_clusters_from_files(self, list_of_cluster_files, output_file, label_elements=False,
                                            separator="@", label_position="first",
                                            function_to_convert_filename_to_label=None):
    dict_of_cluster_dicts = OrderedDict()
    for filename in list_of_cluster_files:
        if function_to_convert_filename_to_label:
            label = function_to_convert_filename_to_label(filename)
        else:
            label = FileRoutines.split_filename(filename)[1]  # use basename as label

        dict_of_cluster_dicts[label] = SynDict()
        dict_of_cluster_dicts[label].read(filename, split_values=True, comments_prefix="#")

    sc_clusters_dict = self.extract_single_copy_clusters(dict_of_cluster_dicts,
                                                         label_elements=label_elements,
                                                         separator=separator,
                                                         label_position=label_position)

    sc_clusters_dict.write(output_file, splited_values=True)
    return sc_clusters_dict

def replace_label(cluster_dict, syn_dict=None, old_separator="@", old_label_position="first",
                  new_separator="@", new_label_position="first"):
    new_cluster_dict = SynDict()
    for cluster in cluster_dict:
        new_cluster_dict[cluster] = []
        for element in cluster_dict[cluster]:
            tmp = element.split(old_separator)
            if old_label_position == "first":
                label = tmp[0]
                element_id = old_separator.join(tmp[1:])
            else:
                label = tmp[-1]
                element_id = old_separator.join(tmp[:-1])

            if new_label_position == "first":
                new_cluster_dict[cluster].append("%s%s%s" % (syn_dict[label] if syn_dict else label,
                                                             new_separator, element_id))
            else:
                new_cluster_dict[cluster].append("%s%s%s" % (element_id, new_separator,
                                                             syn_dict[label] if syn_dict else label))
    return new_cluster_dict

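# Usage sketch (made-up ids): move the label from prefix to suffix while renaming it
# via a synonym table:
#
#     clusters = {"fam1": ["cat@geneA", "dog@geneB"]}
#     replace_label(clusters, syn_dict={"cat": "felis", "dog": "canis"},
#                   old_label_position="first", new_label_position="last")
#     # -> {"fam1": ["geneA@felis", "geneB@canis"]}
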
def split_proteins_per_species(self, dir_with_proteins, output_dir, input_format="fasta",
                               output_format="fasta"):
    input_files = self.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str)
                                                  else dir_with_proteins)

    out_dir = self.check_path(output_dir)
    self.safe_mkdir(out_dir)

    protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

    syn_dict = SynDict()
    for protein_id in protein_dict:
        taxa_id = protein_id.split(".")[0]
        if taxa_id not in syn_dict:
            syn_dict[taxa_id] = []
        syn_dict[taxa_id].append(protein_id)

    def renamed_records_generator(record_dict, taxa_id):
        for record_id in syn_dict[taxa_id]:
            record = deepcopy(record_dict[record_id])
            # strip the leading taxon prefix from the record id
            record.id = ".".join(record_id.split(".")[1:])
            yield record

    for taxa_id in syn_dict:
        out_file = "%s%s.pep" % (out_dir, taxa_id)
        SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)

def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w",
                                              cluster_column=0, element_column=1, column_separator="\t",
                                              element_separator=",", id_column=None):
    """
    mode: "w" - if elements from element_id_list are present in a cluster, extract only those elements
          "a" - if elements from element_id_list are present in a cluster, extract all its elements
    """
    cluster_dict = SynDict(filename=cluster_file, split_values=True, comments_prefix="#",
                           key_index=cluster_column, value_index=element_column,
                           separator=column_separator, values_separator=element_separator)
    element_id_list = IdList(filename=element_file, comments_prefix="#",
                             column_number=id_column)
    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list, mode=mode)
    extracted_clusters.write(output_file, splited_values=True)

def calculate_fpkm_for_count_table(count_table_file, transcript_length_file, output_file, separator="\t"):
    length_dict = SynDict(filename=transcript_length_file, expression=int, comments_prefix="#")

    with open(count_table_file, "r") as in_fd:
        header_list = in_fd.readline().strip().split(separator)
        samples_list = header_list[1:]
        gene_list = IdList()
        count_list = []
        for line in in_fd:
            tmp = line.strip().split(separator)
            gene_list.append(tmp[0])
            count_list.append(list(map(float, tmp[1:])))

    per_sample_total_counts = []
    for sample_index in range(0, len(samples_list)):
        total_counts = 0
        for gene_index in range(0, len(count_list)):
            total_counts += count_list[gene_index][sample_index]
        per_sample_total_counts.append(total_counts)

    with open(output_file, "w") as out_fd:
        out_fd.write(separator.join(header_list) + "\n")
        for gene_index in range(0, len(count_list)):
            normalized_counts_list = []
            for sample_index in range(0, len(samples_list)):
                # FPKM = counts * 10^9 / (transcript length * total counts in sample)
                gene_count = count_list[gene_index][sample_index] * (10 ** 9) / length_dict[gene_list[gene_index]] / per_sample_total_counts[sample_index]
                normalized_counts_list.append(gene_count)
            out_fd.write("%s\t%s\n" % (gene_list[gene_index], "\t".join(map(str, normalized_counts_list))))

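# Worked example of the FPKM arithmetic above (values are made up): a transcript of
# length 2,000 bp with 400 reads, in a sample with 20,000,000 total counted reads:
#
#     FPKM = 400 * 10**9 / 2000 / 20000000  # = 10.0
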
def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):
    syn_dict = SynDict(filename=syn_file)

    skipped_id_list = IdSet()

    output_gff = "%s.renamed.gff" % output_prefix
    skipped_gff = "%s.skipped.gff" % output_prefix
    skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

    with self.metaopen(input_gff, "r") as in_fd, \
            self.metaopen(output_gff, "w") as out_fd, \
            self.metaopen(skipped_gff, "w") as skipped_fd:
        for line in in_fd:
            if line[0] == "#":
                out_fd.write(line)
                continue  # do not try to rename comment lines
            gff_list = line.split("\t")
            if gff_list[0] in syn_dict:
                gff_list[0] = syn_dict[gff_list[0]]
                out_fd.write("\t".join(gff_list))
            else:
                skipped_fd.write(line)
                skipped_id_list.add(gff_list[0])
    if verbose:
        print("Not renamed scaffolds: %i" % len(skipped_id_list))
    skipped_id_list.write(skipped_id_file)

def parse_regions(self, input_file, format="gff", comment_prefix="#", separator="\t",
                  bed_format="0-based"):
    region_dict = SynDict()

    # All coordinates are converted to 0-based python notation
    if format == "gff":
        for line in self.file_line_as_list_generator(input_file, comments_prefix=comment_prefix,
                                                     separator=separator):
            scaffold = line[self.GFF_SCAFFOLD_COLUMN]
            region = [int(line[self.GFF_START_COLUMN]) - 1, int(line[self.GFF_END_COLUMN])]
            if scaffold not in region_dict:
                region_dict[scaffold] = [region]
            else:
                region_dict[scaffold].append(region)
    elif format == "bed":
        for line in self.file_line_as_list_generator(input_file, comments_prefix=comment_prefix,
                                                     separator=separator):
            scaffold = line[self.BED_SCAFFOLD_COLUMN]
            start = (int(line[self.BED_START_COLUMN]) - 1) if bed_format == "1-based" \
                else int(line[self.BED_START_COLUMN])
            region = [start, int(line[self.BED_END_COLUMN])]
            if scaffold not in region_dict:
                region_dict[scaffold] = [region]
            else:
                region_dict[scaffold].append(region)
    elif format == "gatk":
        pass  # not implemented yet

    return region_dict

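# Coordinate conventions, illustrated: a GFF feature spanning bases 101..200 (1-based,
# inclusive) is stored as [100, 200] in 0-based half-open notation, while a 0-based
# BED line "chr1  100  200" is stored unchanged as [100, 200].
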
def add_gene_synonyms(self, input_file, output_file, synonym_file, key_column=0, value_column=1,
                      header_name_for_synonym="Common_name", snpeff_tab_column_id_column=8):
    synonym_dict = SynDict(filename=synonym_file, key_index=key_column, value_index=value_column,
                           comments_prefix="#")

    with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
        header = in_fd.readline().strip() + "\t%s\n" % header_name_for_synonym
        out_fd.write(header)
        for line in in_fd:
            tmp = line.strip().split("\t")
            gene_name = tmp[snpeff_tab_column_id_column]
            # append the synonym, or an empty field if the gene has none
            tmp.append(synonym_dict[gene_name] if gene_name in synonym_dict else "")
            out_fd.write("\t".join(tmp) + "\n")

def add_add_new_column_by_key_column(self, table_file, syn_dict_file, key_column, output_file,
                                     new_column_name=None, separator='\t', absent_value="."):
    column_syn_dict = SynDict(filename=syn_dict_file, allow_repeats_of_key=True, values_separator="@")

    with open(table_file, "r") as in_fd, open(output_file, "w") as out_fd:
        if new_column_name:
            header_line = in_fd.readline().strip() + "\t%s\n" % new_column_name
            out_fd.write(header_line)
        for line in in_fd:
            line_list = line.strip().split(separator)
            # append all synonyms joined by "|", or the absent_value placeholder
            line_list.append(absent_value if line_list[key_column] not in column_syn_dict
                             else "|".join(column_syn_dict[line_list[key_column]]))
            out_fd.write(separator.join(line_list) + "\n")

def replace_column_value_by_syn(input_file, syn_file, out_file, column=0, comment_prefix=None,
                                separator="\t", syn_header=False, syn_separator="\t",
                                syn_key_index=0, syn_value_index=1, syn_comment_prefix=None):
    syn_dict = SynDict(filename=syn_file, header=syn_header, separator=syn_separator,
                       key_index=syn_key_index, value_index=syn_value_index,
                       comments_prefix=syn_comment_prefix)

    if comment_prefix:
        comment_prefix_len = len(comment_prefix)

    line_number = 0
    replaced = 0
    not_replaced = 0
    with open(input_file, "r") as in_fd, open(out_file, "w") as out_fd:
        for line in in_fd:
            line_number += 1
            if comment_prefix:
                if line[0:comment_prefix_len] == comment_prefix:
                    out_fd.write(line)
                    continue
            line_list = line.strip("\n").split(separator)
            if len(line_list) < column + 1:
                # write short lines through unchanged instead of crashing on the index below
                sys.stderr.write("WARNING!!! Line %i doesn't have column %i\n" % (line_number, column))
                out_fd.write(line)
                continue
            if line_list[column] in syn_dict:
                replaced += 1
                line_list[column] = syn_dict[line_list[column]]
            else:
                not_replaced += 1
            out_fd.write(separator.join(line_list))
            out_fd.write("\n")

    sys.stderr.write("Replaced: %i\nNot replaced: %i\n" % (replaced, not_replaced))

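# Usage sketch (hypothetical files): rename values in the first column of a
# tab-separated table using a two-column synonym file; replaced/unreplaced counts
# are reported on stderr:
#
#     replace_column_value_by_syn("hits.tab", "scaffold_synonyms.tab", "hits.renamed.tab",
#                                 column=0, comment_prefix="#")
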
def extract_predicted_gene_names_from_emapper_annotation_file(emapper_annotation_file, output_file):
    extract_predicted_gene_names_dict = SynDict(filename=emapper_annotation_file, key_index=0,
                                                value_index=4, split_values=True,
                                                values_separator=",", comments_prefix="#",
                                                separator="\t")
    extract_predicted_gene_names_dict.header = "#protein_id\tpredicted_gene_name"
    extract_predicted_gene_names_dict.write(output_file, header=True, splited_values=True)
    return extract_predicted_gene_names_dict

def get_taxonomy(taxa_list, output_file, email, input_type="latin"):
    Entrez.email = email
    species_syn_dict = SynDict()

    with open(output_file, "w") as out_file:
        out_file.write("#species\trank\tlineage\n")
        if input_type == "latin":
            for taxon in taxa_list:
                print("Handling %s" % taxon)
                summary = Entrez.read(Entrez.esearch(db="taxonomy", term=taxon))
                if summary:
                    id_list = summary["IdList"]
                    species_syn_dict[taxon] = []
                    for taxon_id in id_list:
                        print("Handling %s" % taxon_id)
                        record = Entrez.read(Entrez.efetch(db="taxonomy", id=taxon_id, retmode="xml"))
                        out_file.write("%s\t%s\t%s\n" % (taxon, record[0]["Rank"], record[0]["Lineage"]))
                        species_syn_dict[taxon].append(record[0]["ScientificName"])
        elif input_type == "id":
            for taxon in taxa_list:
                print("Handling %s" % taxon)
                species_syn_dict[taxon] = []
                record = Entrez.read(Entrez.efetch(db="taxonomy", id=taxon, retmode="xml"))
                out_file.write("%s\t%s\t%s\n" % (taxon, record[0]["Rank"], record[0]["Lineage"]))
                species_syn_dict[taxon].append(record[0]["ScientificName"])

    return species_syn_dict

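# Usage sketch (requires network access to NCBI Entrez; NCBI expects a valid contact
# email). Taxa may be given as latin names or as NCBI taxonomy ids:
#
#     syn = get_taxonomy(["Felis catus", "Canis lupus"], "taxonomy.tab",
#                        "user@example.com", input_type="latin")
#
# Returns a SynDict mapping each input taxon to the scientific name(s) NCBI resolved
# it to; ranks and lineages are written to the output file.
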
def get_species_from_eggnog_tsv(self, eggnog_tsv, output_prefix, email=None):
    cluster_dict = SynDict(filename=eggnog_tsv, key_index=1, value_index=5, split_values=True)
    species_ids = self.extract_labels_from_cluster_elements(cluster_dict, separator=".",
                                                            label_position="first")
    if not email:
        species = species_ids
    else:
        species = NCBIRoutines.get_taxonomy(species_ids, "%s.species.taxonomy" % output_prefix,
                                            email, input_type="id")

    species.write("%s.species" % output_prefix, splited_values=True)

    for species_id in species:
        for i in range(0, len(species[species_id])):
            species[species_id][i] = species[species_id][i].lower().replace(" ", "_")

    species.write("%s.replaced_spaces.species" % output_prefix, splited_values=True)

def label_cluster_elements_from_file(self, input_file, label, output_file, separator="@",
                                     label_position="first"):
    input_dict = SynDict()
    input_dict.read(input_file, split_values=True, comments_prefix="#")
    output_dict = self.label_cluster_elements(input_dict, label, separator=separator,
                                              label_position=label_position)
    output_dict.write(output_file, splited_values=True)
    return output_dict