Example #1
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse

from Tools.HMMER import HMMER3

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_hmm",
                    action="store",
                    dest="input",
                    help="Input hmm3 file")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file with ids. Default: stdout")

args = parser.parse_args()

HMMER3.get_ids_from_hmm3(
    args.input,
    ids_file=None if args.output == 'stdout' else args.output,
    return_ids_list=False)
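
If the ids are needed in memory instead of in a file, the same helper appears to support that directly. A minimal sketch appended to the script above, assuming return_ids_list=True makes get_ids_from_hmm3 return the collected ids:

# Assumption: with return_ids_list=True the call returns the ids instead of only writing them.
hmm_ids = HMMER3.get_ids_from_hmm3(args.input,
                                   ids_file=None,
                                   return_ids_list=True)
print("%i profiles found" % len(hmm_ids))
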
Example #2
                    help="Directory to write intermediate(splited) parseable table of per-sequence hits")
parser.add_argument("--domtblout_dir", action="store", dest="domtblout_dir",
                    default="domtblout_dir", type=check_path,
                    help="Directory to write intermediate(splited) parseable table of per-domain hits")
parser.add_argument("--pfamtblout_dir", action="store", dest="pfamtblout_dir",
                    default="pfamtblout_dir", type=check_path,
                    help="Directory to write intermediate(splited) table of hits and domains to file, in Pfam format ")

parser.add_argument("--tblout", action="store", dest="tblout",
                    help="File to save parseable table of per-sequence hits")
parser.add_argument("--domtblout", action="store", dest="domtblout",
                    help="File to save parseable table of per-domain hits")
parser.add_argument("--pfamtblout", action="store", dest="pfamtblout",
                    help="File to save table of hits and domains to file, in Pfam format ")
parser.add_argument("--hmmer_dir", action="store", dest="path", default="",
                    help="Path to directory with hmmer3.1 binaries")
args = parser.parse_args()


HMMER3.threads = 1
HMMER3.path = args.path
HMMER3.parallel_hmmscan(args.input, args.input_seq, args.output, num_of_seqs_per_scan=None, split_dir="splited_fasta",
                        splited_output_dir=args.hmmscan_output_dir, threads=args.threads,
                        combine_output_to_single_file=args.combine_output, dont_output_alignments=args.no_alignment,
                        tblout_outfile=args.tblout, domtblout_outfile=args.domtblout,
                        pfamtblout_outfile=args.pfamtblout,
                        splited_tblout_dir=args.tblout_dir, splited_domtblout_dir=args.domtblout_dir,
                        splited_pfamtblout_dir=args.pfamtblout_dir
                        )
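
Across these examples the wrapper is configured through class attributes before any call is made. A minimal sketch of that pattern, using only attributes that appear in the examples (the values are placeholders):

from Tools.HMMER import HMMER3

HMMER3.path = ""                      # directory with the hmmer3 binaries; "" if they are on PATH
HMMER3.threads = 4                    # number of hmmscan processes run in parallel
HMMER3.timelog = "hmmscan.timelog"    # optional run-time log (see Example #7)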

Example #3
    def predict_genes(self,
                      output_prefix,
                      annotation_species_prefix,
                      genome_fasta,
                      augustus_species,
                      output_directory="./",
                      augustus_strand=None,
                      augustus_gene_model=None,
                      augustus_config_dir=None,
                      augustus_use_softmasking=None,
                      augustus_other_options="",
                      augustus_hintsfile=None,
                      augustus_extrinsicCfgFile=None,
                      augustus_predict_UTR=None,
                      augustus_min_intron_len=None,
                      threads=1,
                      augustus_dir="",
                      hmmer_dir="",
                      blast_dir="",
                      stop_codons_list=("TGA", "TAA", "TAG"),
                      genetic_code_table=1):

        draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix)

        augustus_splited_input_dir = "%s/splited_input/" % output_directory
        augustus_splited_output_dir = "%s/splited_output_dir" % output_directory

        output_raw_gff = "%s.raw.gff" % draft_file_prefix
        output_gff = "%s.renamed.gff" % draft_file_prefix
        augustus_pep = "%s.pep" % draft_file_prefix

        AUGUSTUS.path = augustus_dir
        AUGUSTUS.threads = threads
        HMMER3.path = hmmer_dir
        HMMER3.threads = threads
        BLASTp.path = blast_dir
        BLASTp.threads = threads

        print("Annotating genes...")
        AUGUSTUS.parallel_predict(
            augustus_species,
            genome_fasta,
            output_raw_gff,
            strand=augustus_strand,
            gene_model=augustus_gene_model,
            output_gff3=True,
            other_options=augustus_other_options,
            config_dir=augustus_config_dir,
            use_softmasking=augustus_use_softmasking,
            hints_file=augustus_hintsfile,
            split_dir=augustus_splited_input_dir,
            splited_output_dir=augustus_splited_output_dir,
            extrinsicCfgFile=augustus_extrinsicCfgFile,
            predict_UTR=augustus_predict_UTR,
            combine_output_to_single_file=True,
            min_intron_len=augustus_min_intron_len)

        #replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):

        AUGUSTUS.replace_augustus_ids(output_raw_gff,
                                      draft_file_prefix,
                                      species_prefix=annotation_species_prefix,
                                      number_of_digits_in_id=8)
        #extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False)
        gffread_file_prefix = "%s.gffread" % draft_file_prefix
        gffread_transcripts_file, gffread_cds_file, gffread_pep_file = Gffread.extract_transcript_sequences(
            output_gff, genome_fasta, gffread_file_prefix)
        gffread_trimmed_cds = ".".join(
            gffread_cds_file.split(".")[:-1]) + ".trimmed.cds"
        gffread_trimmed_pep = ".".join(
            gffread_pep_file.split(".")[:-1]) + ".trimmed.pep"
        self.trim_cds_and_remove_terminal_stop_codons(
            gffread_cds_file,
            gffread_trimmed_cds,
            stop_codons_list=stop_codons_list
        )  # uses the default stop codons (universal genetic code); note that this will affect mtDNA proteins
        inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix
        self.translate_sequences_from_file(
            gffread_trimmed_cds,
            gffread_trimmed_pep,
            format="fasta",
            id_expression=None,
            genetic_code_table=genetic_code_table,
            translate_to_stop=False,
            prefix_of_file_inframe_stop_codons_seqsin=inframe_stop_codons_file_prefix)  # Universal code !!!

        AUGUSTUS.extract_gene_ids_from_output(output_gff,
                                              all_annotated_genes_ids)
        AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)

        print("Extracting peptides...")

        AUGUSTUS.extract_proteins_from_output(
            output_gff,
            output_pep,
            id_prefix="",
            evidence_stats_file=output_evidence_stats,
            supported_by_hints_file=output_supported_stats)

        self.compare_sequences_from_files(output_pep,
                                          "%s.trimmed.pep" % args.output,
                                          "comparison_of_peptides",
                                          format="fasta",
                                          verbose=True)

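        # keep column 2 of the hint-support stats table (the ids), skipping its header line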
        os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" %
                  (output_supported_stats, output_supported_stats_ids))

        print("Annotating domains(Pfam database)...")

        HMMER3.parallel_hmmscan(
            args.pfam_db,
            output_pep,
            output_hmmscan,
            num_of_seqs_per_scan=None,
            split_dir="splited_hmmscan_fasta/",
            splited_output_dir="splited_hmmscan_output_dir",
            tblout_outfile=None,
            domtblout_outfile=output_domtblout,
            pfamtblout_outfile=None,
            splited_tblout_dir=None,
            splited_domtblout_dir="hmmscan_domtblout/")
        HMMER3.extract_dom_ids_hits_from_domtblout(
            output_domtblout, output_pfam_annotated_dom_ids)
        hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(
            output_domtblout, output_pfam_annotated_dom_names)
        supported_ids = IdSet(hits_dict.keys())
        supported_ids.write(output_pfam_supported_transcripts_ids)
        remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
            output_pfam_supported_transcripts_ids,
            output_pfam_supported_genes_ids)
        os.system(remove_transcript_ids_str)

        print("Annotating peptides(Swissprot database)...")

        BLASTp.parallel_blastp(output_pep,
                               args.swissprot_db,
                               evalue=0.0000001,
                               output_format=6,
                               outfile=output_swissprot_blastp_hits,
                               split_dir="splited_blastp_fasta",
                               splited_output_dir="splited_blastp_output_dir")
        hits_dict = BLASTp.extract_hits_from_tbl_output(
            output_swissprot_blastp_hits, output_swissprot_blastp_hits_names)
        supported_ids = IdSet(hits_dict.keys())
        supported_ids.write(output_swissprot_supported_transcripts_ids)

        remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
            output_swissprot_supported_transcripts_ids,
            output_swissprot_supported_genes_ids)
        os.system(remove_transcript_ids_str)
        """
Example #4
                                              "comparison_of_peptides",
                                              format="fasta",
                                              verbose=True)

os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" %
          (output_supported_stats, output_supported_stats_ids))

if args.pfam_db:
    print("Annotating domains(Pfam database)...")
    HMMER3.threads = args.threads
    HMMER3.parallel_hmmscan(args.pfam_db,
                            output_pep,
                            output_hmmscan,
                            num_of_seqs_per_scan=None,
                            split_dir="splited_hmmscan_fasta/",
                            splited_output_dir="splited_hmmscan_output_dir",
                            tblout_outfile=None,
                            domtblout_outfile=output_domtblout,
                            pfamtblout_outfile=None,
                            splited_tblout_dir=None,
                            splited_domtblout_dir="hmmscan_domtblout/")
    HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout,
                                               output_pfam_annotated_dom_ids)
    hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(
        output_domtblout, output_pfam_annotated_dom_names)
    supported_ids = IdSet(hits_dict.keys())
    supported_ids.write(output_pfam_supported_transcripts_ids)
    remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
        output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids)
    os.system(remove_transcript_ids_str)
Example #5
                    "--header",
                    action="store_true",
                    dest="header",
                    help="Header is present in id file. Default: False")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    required=True,
                    help="Format of the file with hits")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
id_list = IdList()
id_list = id_list.read(args.id_file, header=args.header)

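# write every hit whose query id appears in id_list to out_fd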
HMMER3.extract_hits_by_query_ids(id_list,
                                 args.input,
                                 out_fd,
                                 fileformat=args.format,
                                 close_after_if_file_object=True)

out_fd.close()
Example #6
blastp_split_dir = "%ssplited_fasta_dir/" % blastp_dir
blastp_splited_output_dir = "%ssplited_output_dir" % blastp_dir
HMMER3.path = args.hmmer_dir
HMMER3.threads = args.threads
BLASTp.threads = args.threads

TransDecoder.extract_longest_orfs(
    args.input,
    genetic_code=args.genetic_code,
    analyze_only_top_strand=args.analyze_only_top_strand,
    minimum_protein_length=args.min_prot_len)
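# optional homology evidence: annotate the candidate ORF peptides with Pfam domains and/or blastp hits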
if args.pfam_database:
    HMMER3.parallel_hmmscan(args.pfam_database,
                            pep_from_longest_orfs,
                            hmmscan_vs_pfam_output,
                            split_dir=hmmscan_splited_fasta_dir,
                            splited_domtblout_dir=splited_domtblout_dir,
                            domtblout_outfile=domtblout_outfile,
                            dont_output_alignments=True)
if args.blast_database:
    BLASTp.parallel_blastp(pep_from_longest_orfs,
                           args.blast_database,
                           outfile=blastp_outfile,
                           evalue=0.00001,
                           output_format=6,
                           blast_options=" -max_target_seqs 1",
                           combine_output_to_single_file=True,
                           split_dir=blastp_split_dir,
                           splited_output_dir=blastp_splited_output_dir)

TransDecoder.predict_pep(
Example #7
tblout_file = "%s.tblout" % args.output_prefix
domtblout_file = "%s.domtblout" % args.output_prefix
pfamtblout_file = "%s.pfamtblout" % args.output_prefix

HMMER3.threads = 1
HMMER3.path = args.path
HMMER3.timelog = "%s.timelog" % args.output_prefix

HMMER3.parallel_hmmscan(args.input,
                        args.input_seq,
                        hits_file,
                        num_of_seqs_per_scan=None,
                        split_dir="splited_fasta",
                        splited_output_dir=args.hmmscan_output_dir,
                        threads=args.threads,
                        combine_output_to_single_file=True,
                        dont_output_alignments=args.no_alignment,
                        tblout_outfile=tblout_file,
                        domtblout_outfile=domtblout_file,
                        pfamtblout_outfile=pfamtblout_file,
                        splited_tblout_dir=args.tblout_dir,
                        splited_domtblout_dir=args.domtblout_dir,
                        splited_pfamtblout_dir=args.pfamtblout_dir,
                        biopython_165_compartibility=True)

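# keep the best hit per query; queries without a significant hit or absent from the report go to separate id files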
HMMER3.extract_top_hits(hits_file,
                        top_hits_file,
                        not_significant_ids_file=not_significant_ids_file,
                        not_found_ids_file=not_found_ids_file)
HMMER3.get_families_from_top_hits(top_hits_file, fam_file)
Example #8
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse

from Tools.HMMER import HMMER3

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_hmm",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input hmmer3 domtblout file")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Output prefix")

args = parser.parse_args()

output_pfam_annotated_dom_ids = "%s.dom_ids" % args.output_prefix
output_pfam_annotated_dom_names = "%s.dom_names" % args.output_prefix

HMMER3.extract_dom_ids_hits_from_domtblout(args.input,
                                           output_pfam_annotated_dom_ids)
HMMER3.extract_dom_names_hits_from_domtblout(args.input,
                                             output_pfam_annotated_dom_names)
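
Taken together, a minimal end-to-end sketch of the workflow these examples show: configure the wrapper, run a parallel hmmscan against a profile database, and pull the per-domain hits out of the domtblout report. The database path and file names below are placeholders; the keyword arguments mirror the calls above.

from Tools.HMMER import HMMER3

HMMER3.path = ""        # hmmer3 binaries are assumed to be on PATH
HMMER3.threads = 4

pfam_db = "Pfam-A.hmm"                  # placeholder profile database
proteins = "proteins.fasta"             # placeholder query peptides
domtblout_file = "proteins.domtblout"

HMMER3.parallel_hmmscan(pfam_db,
                        proteins,
                        "proteins.hits",
                        num_of_seqs_per_scan=None,
                        split_dir="splited_fasta",
                        splited_output_dir="splited_output_dir",
                        combine_output_to_single_file=True,
                        dont_output_alignments=True,
                        domtblout_outfile=domtblout_file,
                        splited_domtblout_dir="splited_domtblout_dir")

# per-domain hits keyed by query id (a dict return value, as used in Examples #3 and #4)
hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(domtblout_file, "proteins.dom_names")
print("%i queries with at least one domain hit" % len(hits_dict))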