def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
    """
    Convert a whitespace-separated RepeatMasker table into GFF lines.

    The first three lines of ``input_file`` are treated as header and skipped.
    Every remaining non-empty line is split on whitespace and turned into one
    GFF record with source ``RepeatMasker`` and feature type ``repeat``.

    :param input_file: path to the table to convert.
    :param output_file: path of the GFF file to write.
    :param annotated_repeat_classes_file: destination for the set of repeat
        classes seen (part of column 10 before the first ``/``).
    :param annotated_repeat_families_file: destination for the set of
        ``class/family`` strings seen (family is ``.`` when absent).
    """
    repeat_classes_set = IdSet()
    repeat_families_set = IdSet()

    with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
        # Skip the three header lines.
        for _ in range(3):
            in_fd.readline()

        for line in in_fd:
            tmp = line.split()
            if not tmp:
                # Blank lines (e.g. a trailing one) carry no record; indexing
                # them below would raise IndexError.
                continue

            # Anything other than an explicit "+" is reported as minus strand.
            strand = "+" if tmp[8] == "+" else "-"

            repeat_class_family = tmp[10].split("/")
            if len(repeat_class_family) == 1:
                # No family given: use the GFF placeholder.
                repeat_class_family.append(".")
            repeat_classes_set.add(repeat_class_family[0])
            repeat_families_set.add("/".join(repeat_class_family))

            parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                         % (repeat_class_family[0], repeat_class_family[1], tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
            out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n"
                         % (tmp[4], tmp[5], tmp[6], strand, parameters))

    repeat_classes_set.write(annotated_repeat_classes_file)
    repeat_families_set.write(annotated_repeat_families_file)
def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b,
                             result_file=None, mode="common"):
    """
    Combine two groups of id files with a set operation and write the result.

    Each group may be a single filename or an iterable of filenames; all ids
    of a group are merged into one set (lines starting with ``#`` are passed
    to the reader as comments).

    :param files_with_ids_from_group_a: filename or iterable of filenames for group A.
    :param files_with_ids_from_group_b: filename or iterable of filenames for group B.
    :param result_file: output path; when falsy the result goes to stdout.
    :param mode: one of ``common`` (A & B), ``only_a`` (A - B), ``only_b``
        (B - A), ``not_common`` (A ^ B), ``combine`` (A | B) or ``count``
        (write a table with the sizes of all of the above).
    :raises ValueError: if ``mode`` is not one of the supported values.
    """
    set_operations = {
        "common": lambda first, second: first & second,
        "only_a": lambda first, second: first - second,
        "only_b": lambda first, second: second - first,
        "not_common": lambda first, second: first ^ second,
        "combine": lambda first, second: first | second,
    }
    # Fail before any file is read instead of hitting an unbound name later.
    if mode != "count" and mode not in set_operations:
        raise ValueError("Unknown mode: %s" % mode)

    def read_group(files):
        # Merge the ids from one file or from every file of an iterable.
        group = IdSet()
        for filename in [files] if isinstance(files, str) else files:
            id_set = IdSet()
            id_set.read(filename, comments_prefix="#")
            group = group | id_set
        return group

    a = read_group(files_with_ids_from_group_a)
    b = read_group(files_with_ids_from_group_b)

    result_fd = open(result_file, "w") if result_file else sys.stdout
    try:
        if mode != "count":
            final_set = IdSet(set_operations[mode](a, b))
            final_set.write(result_fd)
        else:
            result_fd.write("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n" %
                            (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))
    finally:
        # Close only the handle opened here; never close sys.stdout.
        if result_file:
            result_fd.close()
def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Collect ids of clusters that are present in every species with no more than one sequence.

    :param clusters_dict: mapping species -> mapping cluster name -> sequences.
    :param white_list_ids: optional container of cluster names; when truthy,
        clusters outside of it are rejected.
    :param out_file: optional destination the resulting ids are written to.
    :return: IdSet with the accepted cluster names.
    """
    def is_monocluster(cluster_name):
        # Reject on the first species that disqualifies the cluster.
        for species in clusters_dict:
            if white_list_ids and cluster_name not in white_list_ids:
                return False
            if cluster_name not in clusters_dict[species]:
                return False
            if len(clusters_dict[species][cluster_name]) > 1:
                return False
        return True

    monocluster_ids = IdSet()
    for candidate in self.get_cluster_names(clusters_dict):
        if is_monocluster(candidate):
            monocluster_ids.add(candidate)

    if out_file:
        monocluster_ids.write(out_file)
    return monocluster_ids
def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):
    """
    Rename the scaffold column (first column) of a GFF using a synonym file.

    Three files are produced from ``output_prefix``:
    ``<prefix>.renamed.gff`` (comment lines plus records whose scaffold has a
    synonym), ``<prefix>.skipped.gff`` (records whose scaffold has none) and
    ``<prefix>.skipped_scaffolds.ids`` (the ids of those scaffolds).

    :param input_gff: GFF file to read.
    :param syn_file: file loaded into a SynDict mapping old id -> new id.
    :param output_prefix: prefix of the three output files.
    :param verbose: when true, print the number of scaffolds left unrenamed.
    """
    syn_dict = SynDict(filename=syn_file)
    skipped_id_list = IdSet()

    output_gff = "%s.renamed.gff" % output_prefix
    skipped_gff = "%s.skipped.gff" % output_prefix
    skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

    with self.metaopen(input_gff, "r") as in_fd, \
            self.metaopen(output_gff, "w") as out_fd, \
            self.metaopen(skipped_gff, "w") as skipped_fd:
        for line in in_fd:
            if line[0] == "#":
                out_fd.write(line)
                # Comment lines are not records: without this they would also
                # land in the skipped GFF and be counted as skipped scaffolds.
                continue

            gff_list = line.split("\t")
            scaffold_id = gff_list[0]
            if scaffold_id in syn_dict:
                gff_list[0] = syn_dict[scaffold_id]
                out_fd.write("\t".join(gff_list))
            else:
                skipped_fd.write(line)
                skipped_id_list.add(scaffold_id)

    if verbose:
        print("Not renamed scaffolds: %i" % len(skipped_id_list))

    skipped_id_list.write(skipped_id_file)
def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
    """
    Gather the names of all clusters found in any species.

    :param clusters_dict: mapping species -> mapping cluster name -> value.
    :param out_file: optional destination; receives the full (unfiltered) set of names.
    :param white_list_ids: optional ids; when truthy, only names also present
        in it are returned.
    :return: IdSet of cluster names, restricted to the white list if one is given.
    """
    all_names = IdSet()
    for species in clusters_dict:
        all_names |= IdSet(clusters_dict[species].keys())

    # The file is written before the white list is applied.
    if out_file:
        all_names.write(out_file)

    if not white_list_ids:
        return all_names
    return all_names & IdSet(white_list_ids)
def get_scaffold_ids_from_gff(gff_file, out_file=None):
    """
    Collect the distinct values of the first tab-separated column of a GFF file.

    Lines starting with ``#`` are ignored.

    :param gff_file: path of the GFF file to scan.
    :param out_file: optional destination the collected ids are written to.
    :return: IdSet with the scaffold ids.
    """
    scaffold_ids = IdSet()

    with open(gff_file, "r") as gff_fd:
        for record in gff_fd:
            if not record.startswith("#"):
                scaffold_ids.add(record.partition("\t")[0])

    if out_file:
        scaffold_ids.write(out_file)
    return scaffold_ids
def get_column_value_set_from_file(self, input_file, column_number, output_file=None,
                                   separator="\t", comments_prefix="#", verbose=False):
    """
    Build the set of distinct values found in one column of a delimited file.

    :param input_file: file to read via ``self.file_line_as_list_generator``.
    :param column_number: 0-based index of the column to collect.
    :param output_file: optional destination the values are written to.
    :param separator: column separator passed to the line generator.
    :param comments_prefix: comment prefix passed to the line generator.
    :param verbose: when true, print how many different values were found.
    :return: IdSet with the column values.
    """
    rows = self.file_line_as_list_generator(input_file,
                                            separator=separator,
                                            comments_prefix=comments_prefix)
    values = [fields[column_number] for fields in rows]
    column_value_set = IdSet(values)

    if output_file:
        column_value_set.write(output_file)

    if verbose:
        print("#Column %i (0-based) contains %i different values" % (column_number, len(column_value_set)))

    return column_value_set
def predict_genes(self, output_prefix, annotation_species_prefix, genome_fasta, augustus_species,
                  output_directory="./", augustus_strand=None, augustus_gene_model=None,
                  augustus_config_dir=None, augustus_use_softmasking=None, augustus_other_options="",
                  augustus_hintsfile=None, augustus_extrinsicCfgFile=None, augustus_predict_UTR=None,
                  augustus_min_intron_len=None, threads=1, augustus_dir="", hmmer_dir="", blast_dir="",
                  stop_codons_list=("TGA", "TAA", "TAG"), genetic_code_table=1):
    """
    Run AUGUSTUS gene prediction and derive trimmed CDS and peptide sequences.

    Steps performed: parallel AUGUSTUS prediction, replacement of AUGUSTUS ids,
    extraction of transcript/CDS/peptide sequences with Gffread, trimming of
    CDS with removal of terminal stop codons, and translation of the trimmed CDS.
    Draft files are named ``<output_directory>/raw/<output_prefix>.*``.

    :param output_prefix: basename of the produced files.
    :param annotation_species_prefix: species prefix passed to the id replacement step.
    :param genome_fasta: genome sequence in FASTA format.
    :param augustus_species: species model passed to AUGUSTUS.
    :param output_directory: directory holding ``raw/`` and the split input/output dirs.
    :param augustus_strand, augustus_gene_model, augustus_config_dir,
        augustus_use_softmasking, augustus_other_options, augustus_hintsfile,
        augustus_extrinsicCfgFile, augustus_predict_UTR, augustus_min_intron_len:
        forwarded to ``AUGUSTUS.parallel_predict``.
    :param threads: thread count assigned to the AUGUSTUS, HMMER3 and BLASTp wrappers.
    :param augustus_dir, hmmer_dir, blast_dir: tool locations assigned to the wrappers
        (HMMER3 and BLASTp are configured but only used by the disabled legacy steps).
    :param stop_codons_list: stop codons removed from the end of each CDS.
    :param genetic_code_table: genetic code table used for translation.
    """
    draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix)

    augustus_splited_input_dir = "%s/splited_input/" % output_directory
    augustus_splited_output_dir = "%s/splited_output_dir" % output_directory

    output_raw_gff = "%s.raw.gff" % draft_file_prefix
    # NOTE(review): presumably created by AUGUSTUS.replace_augustus_ids below - confirm.
    output_gff = "%s.renamed.gff" % draft_file_prefix
    augustus_pep = "%s.pep" % draft_file_prefix

    AUGUSTUS.path = augustus_dir
    AUGUSTUS.threads = threads
    HMMER3.path = hmmer_dir
    HMMER3.threads = threads
    BLASTp.path = blast_dir
    BLASTp.threads = threads

    print("Annotating genes...")
    AUGUSTUS.parallel_predict(augustus_species, genome_fasta, output_raw_gff,
                              strand=augustus_strand, gene_model=augustus_gene_model,
                              output_gff3=True, other_options=augustus_other_options,
                              config_dir=augustus_config_dir,
                              use_softmasking=augustus_use_softmasking,
                              hints_file=augustus_hintsfile,
                              split_dir=augustus_splited_input_dir,
                              splited_output_dir=augustus_splited_output_dir,
                              extrinsicCfgFile=augustus_extrinsicCfgFile,
                              predict_UTR=augustus_predict_UTR,
                              combine_output_to_single_file=True,
                              min_intron_len=augustus_min_intron_len)

    #replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):
    AUGUSTUS.replace_augustus_ids(output_raw_gff, draft_file_prefix,
                                  species_prefix=annotation_species_prefix,
                                  number_of_digits_in_id=8)

    #extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False)
    gffread_file_prefix = "%s.gffread" % draft_file_prefix
    gffread_transcripts_file, gffread_cds_file, gffread_pep_file = \
        Gffread.extract_transcript_sequences(output_gff, genome_fasta, gffread_file_prefix)

    # Replace the last extension of the Gffread outputs with ".trimmed.<ext>".
    gffread_trimmed_cds = ".".join(gffread_cds_file.split(".")[:-1]) + ".trimmed.cds"
    gffread_trimmed_pep = ".".join(gffread_pep_file.split(".")[:-1]) + ".trimmed.pep"

    # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins
    self.trim_cds_and_remove_terminal_stop_codons(gffread_cds_file, gffread_trimmed_cds,
                                                  stop_codons_list=stop_codons_list)

    inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix
    # Universal code !!!
    self.translate_sequences_from_file(gffread_trimmed_cds, gffread_trimmed_pep, format="fasta",
                                       id_expression=None, genetic_code_table=genetic_code_table,
                                       translate_to_stop=False,
                                       prefix_of_file_inframe_stop_codons_seqsin=inframe_stop_codons_file_prefix)

    # Legacy annotation steps, kept for reference only. They are disabled
    # because they rely on names that are not defined in this method (args,
    # output_pep, CDS_gff, all_annotated_genes_ids, output_* paths) and would
    # raise NameError if executed.
    #
    # AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids)
    # AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)
    #
    # print("Extracting peptides...")
    # AUGUSTUS.extract_proteins_from_output(output_gff, output_pep, id_prefix="",
    #                                       evidence_stats_file=output_evidence_stats,
    #                                       supported_by_hints_file=output_supported_stats)
    #
    # self.compare_sequences_from_files(output_pep, "%s.trimmed.pep" % args.output,
    #                                   "comparison_of_peptides", format="fasta", verbose=True)
    #
    # os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" % (output_supported_stats,
    #                                                                  output_supported_stats_ids))
    #
    # print("Annotating domains(Pfam database)...")
    # HMMER3.parallel_hmmscan(args.pfam_db, output_pep, output_hmmscan, num_of_seqs_per_scan=None,
    #                         split_dir="splited_hmmscan_fasta/",
    #                         splited_output_dir="splited_hmmscan_output_dir",
    #                         tblout_outfile=None, domtblout_outfile=output_domtblout,
    #                         pfamtblout_outfile=None, splited_tblout_dir=None,
    #                         splited_domtblout_dir="hmmscan_domtblout/")
    # HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout, output_pfam_annotated_dom_ids)
    # hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(output_domtblout,
    #                                                          output_pfam_annotated_dom_names)
    # supported_ids = IdSet(hits_dict.keys())
    # supported_ids.write(output_pfam_supported_transcripts_ids)
    # remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
    #     output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids)
    # os.system(remove_transcript_ids_str)
    #
    # print("Annotating peptides(Swissprot database)...")
    # BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001, output_format=6,
    #                        outfile=output_swissprot_blastp_hits,
    #                        split_dir="splited_blastp_fasta",
    #                        splited_output_dir="splited_blastp_output_dir")
    # hits_dict = BLASTp.extract_hits_from_tbl_output(output_swissprot_blastp_hits,
    #                                                 output_swissprot_blastp_hits_names)
    # supported_ids = IdSet(hits_dict.keys())
    # supported_ids.write(output_swissprot_supported_transcripts_ids)
    # remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
    #     output_swissprot_supported_transcripts_ids, output_swissprot_supported_genes_ids)
    # os.system(remove_transcript_ids_str)
output_pep, output_hmmscan, num_of_seqs_per_scan=None, split_dir="splited_hmmscan_fasta/", splited_output_dir="splited_hmmscan_output_dir", tblout_outfile=None, domtblout_outfile=output_domtblout, pfamtblout_outfile=None, splited_tblout_dir=None, splited_domtblout_dir="hmmscan_domtblout/") HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout, output_pfam_annotated_dom_ids) hits_dict = HMMER3.extract_dom_names_hits_from_domtblout( output_domtblout, output_pfam_annotated_dom_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_pfam_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids) os.system(remove_transcript_ids_str) if args.swissprot_db: print("Annotating peptides(Swissprot database)...") BLASTp.threads = args.threads BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001, output_format=6, outfile=output_swissprot_blastp_hits, split_dir="splited_blastp_fasta", splited_output_dir="splited_blastp_output_dir") hits_dict = BLASTp.extract_hits_from_tbl_output(
# Disabled option: the output-file argument is kept only as a string literal.
""" parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file. Default: stdout") """
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py

# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

# Load the synonym table of every species into a two-level dictionary
# keyed by species; each table is read from <species_dir><species>/all.t.
species_syn_dict = TwoLvlDict()

for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species))

# Dump the complete table before any filtering.
species_syn_dict.write("families_all_species.t", absent_symbol=".")

# NOTE(review): filter_by_line presumably keeps the lines accepted by
# is_assembled inside species_syn_dict and returns the rejected ones - the
# file names written below suggest this; confirm against TwoLvlDict.
not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")

# Write the second-level keys of both parts as id lists.
assembled_ids = IdSet(species_syn_dict.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids = IdSet(not_assembled.sl_keys())
not_assembled_ids.write("non_assembled_families.ids")

# Disabled clean-up that belonged to the disabled output-file option.
""" if args.output != "stdout": out_fd.close() """
tmp[i] = tmp[i].split(",") for syn_id in tmp[i]: complicated_families_syn_ids.add(syn_id) sp_set.add(syn_id) complicated_families_syn_dict[sl_key] = sp_set complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True) for entry in complicated_families_dict.all_values(): tmp = entry.split(";") for i in range(0, len(tmp)): if "_" in tmp[i]: tmp[i] = tmp[i][2] tmp[i] = tmp[i].split(",") for syn_id in tmp[i]: complicated_families_syn_ids.add(syn_id) complicated_families_syn_ids.write("complicated_families_check.ids") nonassembled.write("splited_to_several_families.t", absent_symbol=".") assemled_to_different_families = species_syn_dict.filter_by_line(filter_different_assembly) species_syn_dict.write("correctly_assembled_families_in_all_species.t", absent_symbol=".") assemled_to_different_families.write("assembled_to_different_families_in_all_species.t", absent_symbol=".") correctly_assembled_families_synonym = IdList(set(species_syn_dict.all_values())) assemled_to_different_families_synonym = IdList(set(assemled_to_different_families.all_values())) correctly_assembled_families_synonym.write("correctly_assembled_families_syn_in_all_species.ids") assemled_to_different_families_synonym.write("assembled_to_different_families_syn_in_all_species.ids") if args.output != "output": out_fd.close()
# NOTE(review): `species` and `accordance_file` are bound above this excerpt,
# presumably by an enclosing per-species loop - confirm the nesting of this
# statement against the full file.
accordance_dict[species].read(accordance_file, key_index=1, value_index=0)

# Choose how a member name is split on args.name_separator. Both variants
# return a 2-tuple that is unpacked below as (species, pep_name).
if args.name_first:
    def split_name(pep_name):
        # First field first, the remaining fields re-joined second.
        gene_list = pep_name.split(args.name_separator)
        return gene_list[0], args.name_separator.join(gene_list[1:])
else:
    def split_name(pep_name):
        # Last field first, the preceding fields re-joined second.
        gene_list = pep_name.split(args.name_separator)
        return gene_list[-1], args.name_separator.join(gene_list[:-1])

# Families with at least one member missing from the accordance table.
families_with_errors = IdSet()
for family in pep_fam_dict:
    cds_fam_dict[family] = []
    for pep in pep_fam_dict[family]:
        species, pep_name = split_name(pep)
        if pep_name in accordance_dict[species]:
            # Rebuild the member name in the same field order, with the
            # mapped id from the accordance table in place of the peptide id.
            cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \
                "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species)
            cds_fam_dict[family].append(cds_name)
        else:
            print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name))
            families_with_errors.add(family)

# Drop every family that had an unmapped member, then write both results.
for family in families_with_errors:
    cds_fam_dict.pop(family, None)

families_with_errors.write(args.fam_error)
cds_fam_dict.write(args.output, splited_values=True)