def intersect_ids(list_of_group_a, list_of_group_b, mode="common"):
    """
    Merge the id lists of two groups into two sets and compare them.

    list_of_group_a, list_of_group_b: iterables of id collections; every
        collection of a group is united into one IdSet (a and b below).
    mode: one of
        common     - ids present in both groups (a & b)
        only_a     - ids present only in group a (a - b)
        only_b     - ids present only in group b (b - a)
        not_common - ids present in exactly one group (a ^ b)
        combine    - ids present in any group (a | b)
        count      - return sizes instead of ids

    Returns an IdSet for every mode except "count"; for "count" returns the
    tuple (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b),
    len(a | b)).

    Raises ValueError if mode is not one of the values listed above.
    """
    set_operations = {
        "common": lambda a, b: a & b,
        "only_a": lambda a, b: a - b,
        "only_b": lambda a, b: b - a,
        "not_common": lambda a, b: a ^ b,
        "combine": lambda a, b: a | b,
    }
    # Reject unknown modes before doing any work; previously they surfaced
    # only at the very end as an UnboundLocalError.
    if mode != "count" and mode not in set_operations:
        raise ValueError(
            "Unknown mode %r. Allowed modes: common, only_a, only_b, "
            "not_common, combine, count" % (mode,))

    a = IdSet()
    b = IdSet()
    for id_list in list_of_group_a:
        a = a | IdSet(id_list)
    for id_list in list_of_group_b:
        b = b | IdSet(id_list)

    if mode == "count":
        return len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)
    return IdSet(set_operations[mode](a, b))
def extract_monocluster_ids_from_file(self, dir_with_cluster_files, out_file, file_with_white_list_ids=None):
    """
    Read cluster files from a directory and extract monocluster ids.

    File names in dir_with_cluster_files are counted as species names.
    If file_with_white_list_ids is given, the ids read from it are passed
    on as the white list. The result of extract_monocluster_ids is returned.
    """
    if file_with_white_list_ids:
        allowed_ids = IdSet()
        allowed_ids.read(file_with_white_list_ids)
    else:
        allowed_ids = None

    clusters = self.read_cluster_files_from_dir(dir_with_cluster_files)
    return self.extract_monocluster_ids(clusters,
                                        out_file=out_file,
                                        white_list_ids=allowed_ids)
def get_sequence_names(clusters_dict, write_ids=False, out_prefix=None, white_list_ids=None):
    """
    Collect, per species, the union of the members of its clusters.

    clusters_dict: mapping species -> mapping cluster id -> collection of
        sequence names.
    white_list_ids: if non-empty, only clusters whose id is in it are used.
    write_ids: if true, the names of every species are written to
        "<out_prefix>_<species>.ids" (or "<species>.ids" without out_prefix).

    Returns a SynDict mapping species -> IdSet of sequence names.
    """
    sequence_names_dict = SynDict()
    for species in clusters_dict:
        collected = IdSet()
        for cluster_id in clusters_dict[species]:
            if white_list_ids and cluster_id not in white_list_ids:
                continue
            collected = collected | IdSet(clusters_dict[species][cluster_id])
        sequence_names_dict[species] = collected

    # Files are written only after every species has been collected.
    if write_ids:
        for species in clusters_dict:
            if out_prefix:
                ids_path = "%s_%s.ids" % (out_prefix, species)
            else:
                ids_path = "%s.ids" % species
            sequence_names_dict[species].write(ids_path)

    return sequence_names_dict
def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Extracts clusters with only one sequence in all species.

    A cluster name is kept when, for every species of clusters_dict, the
    cluster is present and holds no more than one member; with a non-empty
    white_list_ids the name must also be in the white list.
    The resulting IdSet is written to out_file if it is given, and returned.
    """
    def is_monocluster(name):
        # Mirrors the original for/else: any failed check rejects the name.
        for species in clusters_dict:
            if white_list_ids and name not in white_list_ids:
                return False
            species_clusters = clusters_dict[species]
            if name not in species_clusters:
                return False
            if len(species_clusters[name]) > 1:
                return False
        return True

    monocluster_ids = IdSet()
    for candidate in self.get_cluster_names(clusters_dict):
        if is_monocluster(candidate):
            monocluster_ids.add(candidate)

    if out_file:
        monocluster_ids.write(out_file)
    return monocluster_ids
def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):
    """
    Rename scaffolds (first column) of a GFF file using a synonym file.

    Output files:
        <output_prefix>.renamed.gff           - comment lines and renamed records
        <output_prefix>.skipped.gff           - records whose scaffold has no synonym
        <output_prefix>.skipped_scaffolds.ids - ids of scaffolds that were not renamed

    input_gff: GFF file to read.
    syn_file: file loaded into a SynDict (old scaffold id -> new id).
    verbose: if true, print the number of scaffolds that were not renamed.
    """
    syn_dict = SynDict(filename=syn_file)
    skipped_id_list = IdSet()

    output_gff = "%s.renamed.gff" % output_prefix
    skipped_gff = "%s.skipped.gff" % output_prefix
    skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

    with self.metaopen(input_gff, "r") as in_fd, \
         self.metaopen(output_gff, "w") as out_fd, \
         self.metaopen(skipped_gff, "w") as skipped_fd:

        for line in in_fd:
            if line.startswith("#"):
                out_fd.write(line)
                # Comment lines are copied as-is; without this `continue`
                # they were also treated as records and ended up in the
                # skipped file and in the skipped id set.
                continue
            gff_list = line.split("\t")
            if gff_list[0] in syn_dict:
                gff_list[0] = syn_dict[gff_list[0]]
                out_fd.write("\t".join(gff_list))
            else:
                skipped_fd.write(line)
                skipped_id_list.add(gff_list[0])

    if verbose:
        print("Not renamed scaffolds: %i" % len(skipped_id_list))

    skipped_id_list.write(skipped_id_file)
def get_column_value_set_from_file(self, input_file, column_number, output_file=None,
                                   separator="\t", comments_prefix="#", verbose=False):
    """
    Build the set of distinct values found in one column of a file.

    Lines are obtained from self.file_line_as_list_generator, called with
    the given separator and comments_prefix; column_number is 0-based.
    The set is written to output_file if it is given. With verbose the
    number of distinct values is printed. Returns the IdSet of values.
    """
    rows = self.file_line_as_list_generator(input_file,
                                            separator=separator,
                                            comments_prefix=comments_prefix)
    values = [row[column_number] for row in rows]
    column_value_set = IdSet(values)

    if output_file:
        column_value_set.write(output_file)

    if verbose:
        distinct_count = len(column_value_set)
        print("#Column %i (0-based) contains %i different values" % (column_number, distinct_count))

    return column_value_set
def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
    """
    Return the union of cluster names over all species of clusters_dict.

    The full (unfiltered) union is written to out_file if it is given.
    With a non-empty white_list_ids the returned set is the intersection of
    the union with the white list; otherwise the union itself is returned.
    """
    all_names = IdSet()
    for species in clusters_dict:
        all_names |= IdSet(clusters_dict[species].keys())

    # Note: the file receives the names before white list filtering.
    if out_file:
        all_names.write(out_file)

    if white_list_ids:
        return all_names & IdSet(white_list_ids)
    return all_names
def get_scaffold_ids_from_gff(gff_file, out_file=None):
    """
    Collect the distinct values of the first tab-separated column of a GFF.

    Lines starting with "#" are ignored. The ids are written to out_file
    if it is given. Returns the IdSet of scaffold ids.
    """
    scaffold_ids = IdSet()

    with open(gff_file, "r") as gff_fd:
        for record in gff_fd:
            if record[0] != "#":
                # Everything before the first tab is the scaffold id.
                scaffold_ids.add(record.partition("\t")[0])

    if out_file:
        scaffold_ids.write(out_file)

    return scaffold_ids
help="Input fam file") parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True, help="File with ids of families to extract") parser.add_argument("-o", "--output", action="store", dest="output", default="stdout", help="File to write extracted families. Default - stdout") parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print not found ids. Default - no") args = parser.parse_args() out_file = sys.stdout if args.output == "stdout" else open(args.output, "w") fam_dict = SynDict() fam_dict.read(args.input) id_set = IdSet() id_set.read(args.id_file) extracted_dict = SynDict() for id_entry in id_set: if id_entry in fam_dict: extracted_dict[id_entry] = fam_dict[id_entry] else: if args.verbose: print("%s was not found" % id_entry) extracted_dict.write(out_file, close_after_if_file_object=True)
def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b,
                             result_file=None, mode="common"):
    """
    Read ids of two groups from files, compare them and write the result.

    files_with_ids_from_group_a, files_with_ids_from_group_b: a single file
        name (str) or an iterable of file names; ids of all files of a group
        are united (a and b below). Files are read with "#" as the comment
        prefix.
    result_file: file to write the result to; stdout if not given.
    mode: one of
        common     - ids present in both groups (a & b)
        only_a     - ids present only in group a (a - b)
        only_b     - ids present only in group b (b - a)
        not_common - ids present in exactly one group (a ^ b)
        combine    - ids present in any group (a | b)
        count      - write a tab-separated table with the sizes of a, b and
                     of all sets listed above instead of ids

    Raises ValueError if mode is not one of the values listed above.
    """
    set_operations = {
        "common": lambda a, b: a & b,
        "only_a": lambda a, b: a - b,
        "only_b": lambda a, b: b - a,
        "not_common": lambda a, b: a ^ b,
        "combine": lambda a, b: a | b,
    }
    # Fail early, before any file is read or created; previously an unknown
    # mode surfaced only at the end as an UnboundLocalError.
    if mode != "count" and mode not in set_operations:
        raise ValueError(
            "Unknown mode %r. Allowed modes: common, only_a, only_b, "
            "not_common, combine, count" % (mode,))

    def read_group(files_with_ids):
        # A bare string is a single file name, not an iterable of names.
        filenames = [files_with_ids] if isinstance(files_with_ids, str) else files_with_ids
        group = IdSet()
        for filename in filenames:
            id_set = IdSet()
            id_set.read(filename, comments_prefix="#")
            group = group | id_set
        return group

    a = read_group(files_with_ids_from_group_a)
    b = read_group(files_with_ids_from_group_b)

    result_fd = open(result_file, "w") if result_file else sys.stdout
    try:
        if mode != "count":
            final_set = IdSet(set_operations[mode](a, b))
            final_set.write(result_fd)
        else:
            result_fd.write("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                            % (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))
    finally:
        # Close only the handle opened here; stdout must stay open.
        if result_file:
            result_fd.close()
def extract_transcripts_by_ids(self, input_gff, transcript_id_file, output_gff):
    """
    Write the records of input_gff restricted to the given transcripts.

    transcript_id_file: file with transcript ids, read into an IdSet with
        header=False.
    output_gff: path of the GFF file to create.

    Records come from self.record_with_extracted_transcripts_generator and
    are written with GFF.write.
    """
    transcript_ids = IdSet()
    transcript_ids.read(transcript_id_file, header=False)

    # The context manager guarantees the output is flushed and closed even
    # if record generation fails; the handle used to be left open.
    with open(output_gff, "w") as out_fd:
        GFF.write(self.record_with_extracted_transcripts_generator(input_gff, transcript_ids),
                  out_fd)
# Load the per-species synonym tables ("<species_dir><species>/all.t").
for species in args.species_list:
    species_table_path = "%s%s/all.t" % (args.species_dir, species)
    species_syn_dict[species] = read_synonyms_dict(species_table_path)

species_syn_dict.write("families_all_species.t", absent_symbol=".")

# NOTE(review): species_syn_dict is written again right after filter_by_line,
# so filter_by_line presumably removes the returned entries in place - confirm.
nonassembled = species_syn_dict.filter_by_line(filter_nonassembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")
nonassembled.write("not_assembled_families_in_all_species.t", absent_symbol=".")

complicated_families_dict = nonassembled.filter_by_line(filter_splited_to_several_fam)
complicated_families_dict.write("complicated_families.t", absent_symbol=".")

complicated_families_syn_dict = SynDict()
complicated_families_syn_ids = IdSet()
sl_keys = list(complicated_families_dict.sl_keys())

# For every second-level key gather the synonym ids listed by any species.
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key in complicated_families_dict[species]:
            # Value layout: ";"-separated entries, each a ","-separated id list.
            for entry in complicated_families_dict[species][sl_key].split(";"):
                if "_" in entry:
                    # Entries containing "_" lose their first two characters.
                    entry = entry[2:]
                for syn_id in entry.split(","):
                    complicated_families_syn_ids.add(syn_id)
                    sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files, output_dir,
                                  file_with_white_list_cluster_ids=None, mode="families",
                                  sequence_file_extension="fasta", sequence_file_format="fasta",
                                  label_species=False, separator_for_labeling="@",
                                  species_label_first=True):
    """
    Extract sequences of clusters into separate files.

    basenames of cluster and sequence files must be same

    mode:
        families - one output file per cluster, containing the sequences of
                   that cluster from all species (default)
        species  - one output file per species, containing the sequences of
                   all its (white-listed) clusters
        Any other value writes nothing.

    file_with_white_list_cluster_ids: optional file with ids of clusters to
        restrict the extraction to.
    label_species, separator_for_labeling, species_label_first: used only in
        "families" mode; with label_species the record id becomes
        "<species><separator><id>" (or "<id><separator><species>" when
        species_label_first is false).

    A temporary index "<species>_tmp.idx" is created in the current
    directory for every species and removed at the end, also on failure.
    """
    white_list_ids = None
    if file_with_white_list_cluster_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_cluster_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

    sequence_super_dict = OrderedDict()
    out_dir = FileRoutines.check_path(output_dir)

    index_files = []
    try:
        for species in clusters_dict:
            idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (FileRoutines.check_path(dir_with_sequence_files),
                                         species, sequence_file_extension)
            # Registered before indexing so a partially built index is
            # cleaned up as well.
            index_files.append(idx_file)
            sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file,
                                                          format=sequence_file_format)

        if mode == "species":
            sequence_names = self.get_sequence_names(clusters_dict, write_ids=False, out_prefix=None,
                                                     white_list_ids=white_list_ids)
            for species in sequence_names:
                out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
                SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                                                                    sequence_names[species]),
                            out_file, format=sequence_file_format)

        elif mode == "families":

            def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
                if species_label_first:
                    label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
                else:
                    label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

                for species in seq_super_dict:
                    # NOTE(review): looks up the cluster in every species
                    # without a membership check - confirm that clusters
                    # absent from a species cannot reach this point.
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            # Copy so the indexed record is not modified.
                            record = deepcopy(seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]

            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
                SeqIO.write(per_family_record_generator(sequence_super_dict, clusters_dict, cluster_name),
                            out_file, format=sequence_file_format)
    finally:
        # Temporary indexes used to be left behind whenever an exception
        # interrupted the extraction.
        for idx_file in index_files:
            if os.path.exists(idx_file):
                os.remove(idx_file)
""" parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file. Default: stdout") """ args = parser.parse_args() # run after scripts/expansion/compare_cluster.py # out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") species_syn_dict = TwoLvlDict() for species in args.species_list: species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species)) species_syn_dict.write("families_all_species.t", absent_symbol=".") not_assembled = species_syn_dict.filter_by_line(is_assembled) species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".") assembled_ids = IdSet(species_syn_dict.sl_keys()) assembled_ids.write("assembled_families.ids") not_assembled_ids = IdSet(not_assembled.sl_keys()) not_assembled_ids.write("non_assembled_families.ids") """ if args.output != "stdout": out_fd.close() """
def predict_genes(self, output_prefix, annotation_species_prefix, genome_fasta, augustus_species, output_directory="./", augustus_strand=None, augustus_gene_model=None, augustus_config_dir=None, augustus_use_softmasking=None, augustus_other_options="", augustus_hintsfile=None, augustus_extrinsicCfgFile=None, augustus_predict_UTR=None, augustus_min_intron_len=None, threads=1, augustus_dir="", hmmer_dir="", blast_dir="", stop_codons_list=("TGA", "TAA", "TAG"), genetic_code_table=1): draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix) augustus_splited_input_dir = "%s/splited_input/" % output_directory augustus_splited_output_dir = "%s/splited_output_dir" % output_directory output_raw_gff = "%s.raw.gff" % draft_file_prefix output_gff = "%s.renamed.gff" % draft_file_prefix augustus_pep = "%s.pep" % draft_file_prefix AUGUSTUS.path = augustus_dir AUGUSTUS.threads = threads HMMER3.path = hmmer_dir HMMER3.threads = threads BLASTp.path = blast_dir BLASTp.threads = threads print("Annotating genes...") AUGUSTUS.parallel_predict( augustus_species, genome_fasta, output_raw_gff, strand=augustus_strand, gene_model=augustus_gene_model, output_gff3=True, other_options=augustus_other_options, config_dir=augustus_config_dir, use_softmasking=augustus_use_softmasking, hints_file=augustus_hintsfile, split_dir=augustus_splited_input_dir, splited_output_dir=augustus_splited_output_dir, extrinsicCfgFile=augustus_extrinsicCfgFile, predict_UTR=augustus_predict_UTR, combine_output_to_single_file=True, min_intron_len=augustus_min_intron_len) #replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8): AUGUSTUS.replace_augustus_ids(output_raw_gff, draft_file_prefix, species_prefix=annotation_species_prefix, number_of_digits_in_id=8) #extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False) gffread_file_prefix = "%s.gffread" % draft_file_prefix gffread_transcripts_file, gffread_cds_file, 
gffread_pep_file = Gffread.extract_transcript_sequences( output_gff, genome_fasta, gffread_file_prefix) gffread_trimmed_cds = ".".join( gffread_cds_file.split(".")[:-1]) + ".trimmed.cds" gffread_trimmed_pep = ".".join( gffread_pep_file.split(".")[:-1]) + ".trimmed.pep" self.trim_cds_and_remove_terminal_stop_codons( gffread_cds_file, gffread_trimmed_cds, stop_codons_list=stop_codons_list ) # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix self.translate_sequences_from_file( gffread_trimmed_cds, gffread_trimmed_pep, format="fasta", id_expression=None, genetic_code_table=genetic_code_table, translate_to_stop=False, prefix_of_file_inframe_stop_codons_seqsin= inframe_stop_codons_file_prefix) # Universal code !!! AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids) AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff) print("Extracting peptides...") AUGUSTUS.extract_proteins_from_output( output_gff, output_pep, id_prefix="", evidence_stats_file=output_evidence_stats, supported_by_hints_file=output_supported_stats) self.compare_sequences_from_files(output_pep, "%s.trimmed.pep" % args.output, "comparison_of_peptides", format="fasta", verbose=True) os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" % (output_supported_stats, output_supported_stats_ids)) print("Annotating domains(Pfam database)...") HMMER3.parallel_hmmscan( args.pfam_db, output_pep, output_hmmscan, num_of_seqs_per_scan=None, split_dir="splited_hmmscan_fasta/", splited_output_dir="splited_hmmscan_output_dir", tblout_outfile=None, domtblout_outfile=output_domtblout, pfamtblout_outfile=None, splited_tblout_dir=None, splited_domtblout_dir="hmmscan_domtblout/") HMMER3.extract_dom_ids_hits_from_domtblout( output_domtblout, output_pfam_annotated_dom_ids) hits_dict = HMMER3.extract_dom_names_hits_from_domtblout( output_domtblout, 
output_pfam_annotated_dom_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_pfam_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids) os.system(remove_transcript_ids_str) print("Annotating peptides(Swissprot database)...") BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001, output_format=6, outfile=output_swissprot_blastp_hits, split_dir="splited_blastp_fasta", splited_output_dir="splited_blastp_output_dir") hits_dict = BLASTp.extract_hits_from_tbl_output( output_swissprot_blastp_hits, output_swissprot_blastp_hits_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_swissprot_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_swissprot_supported_transcripts_ids, output_swissprot_supported_genes_ids) os.system(remove_transcript_ids_str) """
HMMER3.parallel_hmmscan(args.pfam_db, output_pep, output_hmmscan, num_of_seqs_per_scan=None, split_dir="splited_hmmscan_fasta/", splited_output_dir="splited_hmmscan_output_dir", tblout_outfile=None, domtblout_outfile=output_domtblout, pfamtblout_outfile=None, splited_tblout_dir=None, splited_domtblout_dir="hmmscan_domtblout/") HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout, output_pfam_annotated_dom_ids) hits_dict = HMMER3.extract_dom_names_hits_from_domtblout( output_domtblout, output_pfam_annotated_dom_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_pfam_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids) os.system(remove_transcript_ids_str) if args.swissprot_db: print("Annotating peptides(Swissprot database)...") BLASTp.threads = args.threads BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001, output_format=6, outfile=output_swissprot_blastp_hits, split_dir="splited_blastp_fasta", splited_output_dir="splited_blastp_output_dir")
def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
    """
    Convert a whitespace-separated repeat annotation table to GFF.

    The first three lines of input_file are skipped as a header. Every
    following non-blank line is split on whitespace and written to
    output_file as a "RepeatMasker"/"repeat" GFF record.

    annotated_repeat_classes_file: receives the distinct repeat classes.
    annotated_repeat_families_file: receives the distinct "class/family"
        combinations (family is "." when the input has no family part).
    """
    repeat_classes_set = IdSet()
    repeat_families_set = IdSet()

    with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
        # Skip the three header lines.
        for _ in range(3):
            in_fd.readline()

        for line in in_fd:
            tmp = line.split()
            if not tmp:
                # Blank lines used to raise IndexError on tmp[8].
                continue

            # Any value other than "+" in the strand column is written as "-".
            strand = "+" if tmp[8] == "+" else "-"

            # Column 11 is "class" or "class/family".
            repeat_class_family = tmp[10].split("/")
            if len(repeat_class_family) == 1:
                repeat_class_family.append(".")
            repeat_classes_set.add(repeat_class_family[0])
            repeat_families_set.add("/".join(repeat_class_family))

            parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                         % (repeat_class_family[0], repeat_class_family[1],
                            tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
            out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n"
                         % (tmp[4], tmp[5], tmp[6], strand, parameters))

    repeat_classes_set.write(annotated_repeat_classes_file)
    repeat_families_set.write(annotated_repeat_families_file)
type=FileRoutines.make_list_of_path_to_files_from_string, help="Comma-separated list of input files/directories with sequences") parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path, help="Directory to output groups_of sequences") parser.add_argument("-f", "--format", action="store", dest="format", default="fasta", help="Format of input and output files. Allowed formats genbank, fasta(default)") parser.add_argument("-e", "--extension", action="store", dest="extension", help="Extension of output files. Default: equal to -f") parser.add_argument("-d", "--id_file", action="store", dest="id_file", help="File with groups of sequences to extract(.fam file).") args = parser.parse_args() FileRoutines.safe_mkdir(args.output) args.extension = args.extension if args.extension else args.format tmp_index_file = "temp.idx" #id_list = read_ids(args.id_file) id_list = IdSet(filename=args.id_file) sequence_groups_id = SynDict() sequence_groups_id.read(args.id_file, split_values=True) #print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format) for group in sequence_groups_id: SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group], verbose=True), "%s%s.%s" % (args.output, group, args.extension), format=args.format) os.remove(tmp_index_file)
accordance_file = "%s/%s.accordance" % (args.accordance_dir, species) accordance_dict[species] = SynDict() accordance_dict[species].read(accordance_file, key_index=1, value_index=0) if args.name_first: def split_name(pep_name): gene_list = pep_name.split(args.name_separator) return gene_list[0], args.name_separator.join(gene_list[1:]) else: def split_name(pep_name): gene_list = pep_name.split(args.name_separator) return gene_list[-1], args.name_separator.join(gene_list[:-1]) families_with_errors = IdSet() for family in pep_fam_dict: cds_fam_dict[family] = [] for pep in pep_fam_dict[family]: species, pep_name = split_name(pep) if pep_name in accordance_dict[species]: cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \ "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species) cds_fam_dict[family].append(cds_name) else: print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name)) families_with_errors.add(family) for family in families_with_errors: cds_fam_dict.pop(family, None)