#!/usr/bin/env python
"""Extract profile ids from a HMMER3 profile (.hmm) file.

Writes the ids contained in the input hmm3 file either to stdout (default)
or to the file given via -o/--output_file.
"""
__author__ = 'Sergei F. Kliver'

import argparse

from Tools.HMMER import HMMER3

parser = argparse.ArgumentParser()
# FIX: mark the input file as required. Previously a missing -i left
# args.input as None and failed later inside HMMER3.get_ids_from_hmm3
# instead of producing a clear argparse error (the sibling domtblout
# script already uses required=True for its input).
parser.add_argument("-i", "--input_hmm", action="store", dest="input", required=True,
                    help="Input hmm3 file")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file with ids. Default: stdout")
args = parser.parse_args()

# ids_file=None tells the library wrapper to write ids to stdout.
HMMER3.get_ids_from_hmm3(args.input,
                         ids_file=None if args.output == 'stdout' else args.output,
                         return_ids_list=False)
# NOTE(review): chunk starts mid-call — the parser.add_argument( opening this
# help= keyword lies above the visible region.
                    help="Directory to write intermediate(splited) parseable table of per-sequence hits")
# Directories for the per-chunk (split) HMMER output tables.
parser.add_argument("--domtblout_dir", action="store", dest="domtblout_dir", default="domtblout_dir",
                    type=check_path,
                    help="Directory to write intermediate(splited) parseable table of per-domain hits")
parser.add_argument("--pfamtblout_dir", action="store", dest="pfamtblout_dir", default="pfamtblout_dir",
                    type=check_path,
                    help="Directory to write intermediate(splited) table of hits and domains to file, in Pfam format ")
# Combined (whole-run) output tables.
parser.add_argument("--tblout", action="store", dest="tblout",
                    help="File to save parseable table of per-sequence hits")
parser.add_argument("--domtblout", action="store", dest="domtblout",
                    help="File to save parseable table of per-domain hits")
parser.add_argument("--pfamtblout", action="store", dest="pfamtblout",
                    help="File to save table of hits and domains to file, in Pfam format ")
parser.add_argument("--hmmer_dir", action="store", dest="path", default="",
                    help="Path to directory with hmmer3.1 binaries")

args = parser.parse_args()

# HMMER3.threads is pinned to 1 while parallelism is passed separately as
# threads=args.threads — presumably each hmmscan worker is single-threaded
# and concurrency comes from running many workers; confirm against the
# HMMER3 wrapper implementation.
HMMER3.threads = 1
HMMER3.path = args.path
HMMER3.parallel_hmmscan(args.input, args.input_seq, args.output,
                        num_of_seqs_per_scan=None,
                        split_dir="splited_fasta",
                        splited_output_dir=args.hmmscan_output_dir,
                        threads=args.threads,
                        combine_output_to_single_file=args.combine_output,
                        dont_output_alignments=args.no_alignment,
                        tblout_outfile=args.tblout,
                        domtblout_outfile=args.domtblout,
                        pfamtblout_outfile=args.pfamtblout,
                        splited_tblout_dir=args.tblout_dir,
                        splited_domtblout_dir=args.domtblout_dir,
                        splited_pfamtblout_dir=args.pfamtblout_dir)
def predict_genes(self, output_prefix, annotation_species_prefix, genome_fasta, augustus_species,
                  output_directory="./",
                  augustus_strand=None, augustus_gene_model=None, augustus_config_dir=None,
                  augustus_use_softmasking=None, augustus_other_options="",
                  augustus_hintsfile=None, augustus_extrinsicCfgFile=None,
                  augustus_predict_UTR=None, augustus_min_intron_len=None,
                  threads=1, augustus_dir="", hmmer_dir="", blast_dir="",
                  stop_codons_list=("TGA", "TAA", "TAG"), genetic_code_table=1):
    """
    Predict genes with AUGUSTUS and post-process the raw annotation.

    Runs AUGUSTUS in parallel on genome_fasta, renames the predicted gene ids
    using annotation_species_prefix, then extracts transcript sequences with
    gffread. Binary locations and thread counts for AUGUSTUS/HMMER3/BLASTp
    are pushed into the tool wrapper classes as class attributes.

    NOTE(review): this chunk is truncated — the method body continues past the
    visible region; the final tuple assignment below is cut mid-statement.
    """
    # All draft outputs share the "<output_directory>/raw/<output_prefix>" prefix.
    draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix)
    augustus_splited_input_dir = "%s/splited_input/" % output_directory
    augustus_splited_output_dir = "%s/splited_output_dir" % output_directory
    output_raw_gff = "%s.raw.gff" % draft_file_prefix
    output_gff = "%s.renamed.gff" % draft_file_prefix
    augustus_pep = "%s.pep" % draft_file_prefix

    # Tool wrappers are configured through class attributes, not per-call options.
    AUGUSTUS.path = augustus_dir
    AUGUSTUS.threads = threads
    HMMER3.path = hmmer_dir
    HMMER3.threads = threads
    BLASTp.path = blast_dir
    BLASTp.threads = threads

    print("Annotating genes...")
    AUGUSTUS.parallel_predict(augustus_species, genome_fasta, output_raw_gff,
                              strand=augustus_strand,
                              gene_model=augustus_gene_model,
                              output_gff3=True,
                              other_options=augustus_other_options,
                              config_dir=augustus_config_dir,
                              use_softmasking=augustus_use_softmasking,
                              hints_file=augustus_hintsfile,
                              split_dir=augustus_splited_input_dir,
                              splited_output_dir=augustus_splited_output_dir,
                              extrinsicCfgFile=augustus_extrinsicCfgFile,
                              predict_UTR=augustus_predict_UTR,
                              combine_output_to_single_file=True,
                              min_intron_len=augustus_min_intron_len)
    #replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):
    AUGUSTUS.replace_augustus_ids(output_raw_gff, draft_file_prefix,
                                  species_prefix=annotation_species_prefix,
                                  number_of_digits_in_id=8)
    #extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False)
    gffread_file_prefix = "%s.gffread" % draft_file_prefix
    # NOTE(review): statement cut at the chunk boundary; it continues in the
    # next chunk with "gffread_pep_file = Gffread.extract_transcript_sequences(...)".
    gffread_transcripts_file, gffread_cds_file,
# NOTE(review): chunk starts mid-statement — in the full file the first line
# is the tail of a tuple assignment begun above the visible region:
# gffread_transcripts_file, gffread_cds_file, gffread_pep_file = ...
gffread_pep_file = Gffread.extract_transcript_sequences(output_gff, genome_fasta, gffread_file_prefix)

# Derive *.trimmed.cds / *.trimmed.pep names by replacing the last extension.
gffread_trimmed_cds = ".".join(gffread_cds_file.split(".")[:-1]) + ".trimmed.cds"
gffread_trimmed_pep = ".".join(gffread_pep_file.split(".")[:-1]) + ".trimmed.pep"

self.trim_cds_and_remove_terminal_stop_codons(gffread_cds_file,
                                              gffread_trimmed_cds,
                                              stop_codons_list=stop_codons_list)  # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins

inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix
self.translate_sequences_from_file(gffread_trimmed_cds, gffread_trimmed_pep,
                                   format="fasta",
                                   id_expression=None,
                                   genetic_code_table=genetic_code_table,
                                   translate_to_stop=False,
                                   prefix_of_file_inframe_stop_codons_seqsin=inframe_stop_codons_file_prefix)  # Universal code !!!

AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids)
AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)

print("Extracting peptides...")
AUGUSTUS.extract_proteins_from_output(output_gff, output_pep,
                                      id_prefix="",
                                      evidence_stats_file=output_evidence_stats,
                                      supported_by_hints_file=output_supported_stats)
# NOTE(review): "args.output" inside a method looks like a leftover from a
# script version of this code — confirm it should not be a parameter/local.
self.compare_sequences_from_files(output_pep, "%s.trimmed.pep" % args.output,
                                  "comparison_of_peptides",
                                  format="fasta", verbose=True)
# Extract the id column (skipping the header line) from the hint-support stats.
os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" % (output_supported_stats,
                                                                 output_supported_stats_ids))

print("Annotating domains(Pfam database)...")
HMMER3.parallel_hmmscan(args.pfam_db, output_pep, output_hmmscan,
                        num_of_seqs_per_scan=None,
                        split_dir="splited_hmmscan_fasta/",
                        splited_output_dir="splited_hmmscan_output_dir",
                        tblout_outfile=None,
                        domtblout_outfile=output_domtblout,
                        pfamtblout_outfile=None,
                        splited_tblout_dir=None,
                        splited_domtblout_dir="hmmscan_domtblout/")
HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout,
                                           output_pfam_annotated_dom_ids)
# NOTE(review): chunk is truncated — this call continues past the visible region.
hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(output_domtblout,
# NOTE(review): chunk starts mid-call — this argument closes the
# HMMER3.extract_dom_names_hits_from_domtblout(...) call opened above the
# visible region.
                                                         output_pfam_annotated_dom_names)

# Transcripts with at least one Pfam domain hit count as "supported".
supported_ids = IdSet(hits_dict.keys())
supported_ids.write(output_pfam_supported_transcripts_ids)
# Strip the ".tN" transcript suffix to collapse transcript ids to gene ids.
remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (output_pfam_supported_transcripts_ids,
                                                                                           output_pfam_supported_genes_ids)
os.system(remove_transcript_ids_str)

print("Annotating peptides(Swissprot database)...")
BLASTp.parallel_blastp(output_pep, args.swissprot_db,
                       evalue=0.0000001,
                       output_format=6,
                       outfile=output_swissprot_blastp_hits,
                       split_dir="splited_blastp_fasta",
                       splited_output_dir="splited_blastp_output_dir")
hits_dict = BLASTp.extract_hits_from_tbl_output(output_swissprot_blastp_hits,
                                                output_swissprot_blastp_hits_names)
# Same supported-id extraction as for Pfam, now for SwissProt BLASTp hits.
supported_ids = IdSet(hits_dict.keys())
supported_ids.write(output_swissprot_supported_transcripts_ids)
remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (output_swissprot_supported_transcripts_ids,
                                                                                           output_swissprot_supported_genes_ids)
os.system(remove_transcript_ids_str)
# NOTE(review): the triple quote below opens a disabled (string-literal) code
# region that continues past this chunk — its contents are not live code.
"""
"comparison_of_peptides", format="fasta", verbose=True) os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" % (output_supported_stats, output_supported_stats_ids)) if args.pfam_db: print("Annotating domains(Pfam database)...") HMMER3.threads = args.threads HMMER3.parallel_hmmscan(args.pfam_db, output_pep, output_hmmscan, num_of_seqs_per_scan=None, split_dir="splited_hmmscan_fasta/", splited_output_dir="splited_hmmscan_output_dir", tblout_outfile=None, domtblout_outfile=output_domtblout, pfamtblout_outfile=None, splited_tblout_dir=None, splited_domtblout_dir="hmmscan_domtblout/") HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout, output_pfam_annotated_dom_ids) hits_dict = HMMER3.extract_dom_names_hits_from_domtblout( output_domtblout, output_pfam_annotated_dom_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_pfam_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids) os.system(remove_transcript_ids_str)
# NOTE(review): chunk starts mid-call — the parser.add_argument( opening the
# "--header" option starts above the visible region.
                    "--header", action="store_true", dest="header",
                    help="Header is present in id file. Default: False")
parser.add_argument("-f", "--format", action="store", dest="format", required=True,
                    help="Format of the file with hits")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file")

args = parser.parse_args()

# NOTE(review): out_fd is opened here but never written to in the visible code;
# args.output (the path or the literal "stdout") is passed to the library call
# below, which presumably opens it itself. Also, when args.output == "stdout",
# out_fd.close() at the end closes the real sys.stdout. Looks like dead/buggy
# leftover code — confirm before removing.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
# IdList.read apparently returns the populated list, hence the reassignment.
id_list = id_list.read(args.id_file, header=args.header)

HMMER3.extract_hits_by_query_ids(id_list, args.input, args.output,
                                 fileformat=args.format,
                                 close_after_if_file_object=True)
out_fd.close()
# Directories for the split BLASTp stage (per-chunk inputs and outputs).
blastp_split_dir = "%ssplited_fasta_dir/" % blastp_dir
blastp_splited_output_dir = "%ssplited_output_dir" % blastp_dir

HMMER3.path = args.hmmer_dir
HMMER3.threads = args.threads
BLASTp.threads = args.threads

# Step 1: extract candidate long ORFs with TransDecoder.
TransDecoder.extract_longest_orfs(args.input,
                                  genetic_code=args.genetic_code,
                                  analyze_only_top_strand=args.analyze_only_top_strand,
                                  minimum_protein_length=args.min_prot_len)

# Step 2 (optional): domain evidence for the ORF peptides from Pfam via hmmscan.
if args.pfam_database:
    HMMER3.parallel_hmmscan(args.pfam_database, pep_from_longest_orfs, hmmscan_vs_pfam_output,
                            split_dir=hmmscan_splited_fasta_dir,
                            splited_domtblout_dir=splited_domtblout_dir,
                            domtblout_outfile=domtblout_outfile,
                            dont_output_alignments=True)

# Step 3 (optional): homology evidence from BLASTp (best single target per query).
if args.blast_database:
    BLASTp.parallel_blastp(pep_from_longest_orfs, args.blast_database,
                           outfile=blastp_outfile,
                           evalue=0.00001,
                           output_format=6,
                           blast_options=" -max_target_seqs 1",
                           combine_output_to_single_file=True,
                           split_dir=blastp_split_dir,
                           splited_output_dir=blastp_splited_output_dir)

# NOTE(review): chunk is truncated — the TransDecoder.predict_pep( call
# continues past the visible region.
TransDecoder.predict_pep(
# Per-run HMMER3 table output paths derived from the common output prefix.
tblout_file = "%s.tblout" % args.output_prefix
domtblout_file = "%s.domtblout" % args.output_prefix
pfamtblout_file = "%s.pfamtblout" % args.output_prefix

# HMMER3.threads is pinned to 1; parallelism is supplied separately via the
# threads=args.threads argument below — presumably one single-threaded
# hmmscan worker per chunk. Confirm against the HMMER3 wrapper.
HMMER3.threads = 1
HMMER3.path = args.path
HMMER3.timelog = "%s.timelog" % args.output_prefix

# NOTE(review): hits_file and the *_ids_file/fam_file names used below are
# defined earlier in this script, outside the visible region.
HMMER3.parallel_hmmscan(args.input, args.input_seq, hits_file,
                        num_of_seqs_per_scan=None,
                        split_dir="splited_fasta",
                        splited_output_dir=args.hmmscan_output_dir,
                        threads=args.threads,
                        combine_output_to_single_file=True,
                        dont_output_alignments=args.no_alignment,
                        tblout_outfile=tblout_file,
                        domtblout_outfile=domtblout_file,
                        pfamtblout_outfile=pfamtblout_file,
                        splited_tblout_dir=args.tblout_dir,
                        splited_domtblout_dir=args.domtblout_dir,
                        splited_pfamtblout_dir=args.pfamtblout_dir,
                        biopython_165_compartibility=True)  # keyword name (sic) matches the wrapper's API
# Keep only the best (top) hit per query, then collapse top hits to families.
HMMER3.extract_top_hits(hits_file, top_hits_file,
                        not_significant_ids_file=not_significant_ids_file,
                        not_found_ids_file=not_found_ids_file)
HMMER3.get_families_from_top_hits(top_hits_file, fam_file)
#!/usr/bin/env python
"""Extract per-domain hit ids and names from a HMMER3 domtblout file.

Produces two files derived from the output prefix:
<prefix>.dom_ids and <prefix>.dom_names.
"""
__author__ = 'Sergei F. Kliver'

import argparse

from Tools.HMMER import HMMER3

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_hmm", action="store", dest="input", required=True,
                    help="Input hmmer3 domtblout file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Output prefix")
args = parser.parse_args()

# Both output paths share the user-supplied prefix.
dom_ids_file = "%s.dom_ids" % args.output_prefix
dom_names_file = "%s.dom_names" % args.output_prefix

HMMER3.extract_dom_ids_hits_from_domtblout(args.input, dom_ids_file)
HMMER3.extract_dom_names_hits_from_domtblout(args.input, dom_names_file)