Beispiel #1
0
def extract_cterminus(da_dir, clusterpksgenes, seq_record, endinggene,
                      feature_by_id):
    """Collect interacting residues from the C-terminus of each PKS gene.

    For every gene id in ``clusterpksgenes`` other than ``endinggene``, the
    last 100 residues of its protein are written to ``input.fasta`` and
    profile-aligned with muscle against ``cterm.fasta`` from ``da_dir``.
    ``extractpositions`` is then called on the resulting ``muscle.fasta``
    with ``[55, 64]`` and the reference name ``EryAII_ref``.

    ``seq_record`` is accepted but not used by this function.

    Returns a dict mapping each processed gene id to the value returned by
    ``extractpositions``.
    """
    reference_profile = os.path.join(da_dir, 'cterm.fasta')
    query_fasta = "input.fasta"

    # All tails are extracted before any alignment is started.
    tails = [(gene, str(utils.get_aa_sequence(feature_by_id[gene]))[-100:])
             for gene in clusterpksgenes if gene != endinggene]

    residues_by_gene = {}
    for gene, tail in tails:
        utils.writefasta([gene], [tail], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet",
            "-in1", reference_profile,
            "-in2", query_fasta,
            "-out", "muscle.fasta",
        ])
        residues_by_gene[gene] = extractpositions(
            "muscle.fasta", [55, 64], "EryAII_ref", gene)
    return residues_by_gene
Beispiel #2
0
def runblast(query, target):
    """Run blastp for ``query`` against the database ``target``.

    Results are requested in output format 6 with at most 10000 target
    sequences and an e-value cutoff of 1e-05.  The output file name is the
    part of ``query`` before its first "." with ".out" appended.
    """
    result_file = query.split(".")[0] + ".out"
    blast_args = ["blastp"]
    blast_args += ["-db", target]
    blast_args += ["-query", query]
    blast_args += ["-outfmt", "6"]
    blast_args += ["-max_target_seqs", "10000"]
    blast_args += ["-evalue", "1e-05"]
    blast_args += ["-out", result_file]
    utils.execute(blast_args)
Beispiel #3
0
def extract_nterminus(da_dir, clusterpksgenes, seq_record, startergene,
                      feature_by_id):
    """Collect interacting residues from the N-terminus of each PKS gene.

    For every gene id in ``clusterpksgenes`` other than ``startergene``, the
    first 50 residues of its protein are written to ``input.fasta`` and
    profile-aligned with muscle against ``nterm.fasta`` from ``da_dir``.
    ``extractpositions`` is then called on the resulting ``muscle.fasta``
    with ``[2, 15]`` and the reference name ``EryAIII_5_6_ref``.

    ``seq_record`` is accepted but not used by this function.

    Returns a dict mapping each processed gene id to the value returned by
    ``extractpositions``.
    """
    reference_profile = os.path.join(da_dir, 'nterm.fasta')
    query_fasta = "input.fasta"

    # All heads are extracted before any alignment is started.
    heads = [(gene, str(utils.get_aa_sequence(feature_by_id[gene]))[:50])
             for gene in clusterpksgenes if gene != startergene]

    residues_by_gene = {}
    for gene, head in heads:
        utils.writefasta([gene], [head], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet",
            "-in1", reference_profile,
            "-in2", query_fasta,
            "-out", "muscle.fasta",
        ])
        residues_by_gene[gene] = extractpositions(
            "muscle.fasta", [2, 15], "EryAIII_5_6_ref", gene)
    return residues_by_gene
def run_pplacer(reference_alignment, alignment_file, data_dir, reference_tree):
    """Place query domains onto the reference tree with pplacer and guppy.

    Runs ``pplacer`` on ``alignment_file`` with the given reference tree and
    reference alignment, using the ``RAxML_bestTree.647KS_RAxML.refpkg``
    reference package inside ``data_dir``.  The placement is written to
    ``pplacer_tree.jplace`` in the current working directory, after which
    ``guppy sing`` is run on that file.

    Returns the path ``<cwd>/pplacer_tree.sing.tre`` (presumably the tree
    written by ``guppy sing``; this function does not check that it exists).

    Exits the process with status 1 when either tool returns a non-zero
    exit status.
    """
    #Locations of files
    #reference_tree = os.path.join(data_dir, "RAxML_bestTree.647KS_RAxML.tre")
    pplacer_json = os.path.join(os.getcwd(), "pplacer_tree.jplace")

    #Reference package creation: taxit create --aln-fasta test_set_for_development.fasta --tree-stats test_set_for_development.log --tree-file test_set_for_development.nwk -P test_set_for_development.refpkg -l test
    #Reference package creation: taxit create --aln-fasta 647KS_mcformat.afa --tree-stats RAxML_info.647KS_RAxML.tre --tree-file RAxML_bestTree.647KS_RAxML.tre -P RAxML_bestTree.647KS_RAxML.refpkg -l 647KS

    pplacer_cmd = [
        "pplacer", "-t", reference_tree, "-r", reference_alignment, "-o",
        pplacer_json, "-c",
        os.path.join(data_dir,
                     "RAxML_bestTree.647KS_RAxML.refpkg"), alignment_file
    ]
    out, err, retcode = utils.execute(pplacer_cmd)
    # Any non-zero status is a failure, not only status 1 (a process killed
    # by a signal, for example, reports a negative return code).
    if retcode != 0:
        logging.error(
            "Running pplacer failed. Check if the program is installed appropriately."
        )
        sys.exit(1)
    guppy_cmd = ["guppy", "sing", pplacer_json]
    out, err, retcode = utils.execute(guppy_cmd)
    if retcode != 0:
        logging.error(
            "Running guppy failed. Check if the program is installed appropriately."
        )
        sys.exit(1)
    return os.getcwd() + os.sep + "pplacer_tree.sing.tre"
Beispiel #5
0
def alignsmcogs(smcog, inputnr):
    """Profile-align ``input<inputnr>.fasta`` to an smCOG seed alignment.

    The seed alignment is ``<smcog, lower-cased>_muscle.fasta`` resolved
    through ``utils.get_full_path`` relative to this module.  muscle writes
    the combined alignment to ``muscle<inputnr>.fasta``.
    """
    reference_alignment = utils.get_full_path(
        __file__, "%s_muscle.fasta" % str(smcog).lower())
    # The command is the same on every platform.  Selecting it by
    # sys.platform left it undefined on any platform string other than
    # 'linux2', 'win32' or 'darwin'.
    musclecommand = [
        "muscle", "-quiet", "-profile",
        "-in1", reference_alignment,
        "-in2", "input" + str(inputnr) + ".fasta",
        "-out", "muscle" + str(inputnr) + ".fasta",
    ]
    utils.execute(musclecommand)
Beispiel #6
0
def make_blastDB(query_fasta, options):
    """Build the protein BLAST database ``targetBlastDB`` from ``query_fasta``.

    The database is created with ``makeblastdb`` inside
    ``options.metabolicmodeldir``.  If ``targetBlastDB.psq`` does not exist
    afterwards, an error is logged and the process exits with status 1.
    """
    db_prefix = options.metabolicmodeldir + os.sep + 'targetBlastDB'
    makeblastdb = utils.locate_executable('makeblastdb')

    utils.execute(
        [makeblastdb, '-in', query_fasta, '-out', db_prefix, '-dbtype', 'prot'])

    #Checks if DB is properly created; otherwise shutdown
    if not os.path.isfile(db_prefix + '.psq'):
        # No exception is being handled here, so logging.exception would only
        # append a meaningless empty traceback; log a plain error instead.
        logging.error("error in make_blastDB: blast DB not created")
        #FIXME: don't use sys.exit
        sys.exit(1)
Beispiel #7
0
def convert_to_tabular(tempdir):
    """Run ``diamond view`` on ``matches.daa`` in ``tempdir``.

    The output is written to ``input.out`` in the same directory.  Returns
    whatever ``utils.execute`` returns for the command.
    """
    daa_file = path.join(tempdir, "matches.daa")
    tabular_file = path.join(tempdir, "input.out")
    return utils.execute(
        ["diamond", "view", "-a", daa_file, "-o", tabular_file])
Beispiel #8
0
def check_prereqs(options):
    """Check that the required executables and HMM files are available.

    Every non-optional entry of ``_required_binaries`` must be locatable.
    Every profile in ``_markov_models`` must exist; when any of its
    ``_binary_extensions`` companion files is missing, ``hmmpress`` is run
    once on the profile.

    ``options`` is accepted but not used.

    Returns a list of human-readable failure messages (empty on success).
    """
    problems = [
        "Failed to locate file: %r" % binary_name
        for binary_name, optional in _required_binaries
        if utils.locate_executable(binary_name) is None and not optional
    ]

    for model in _markov_models:
        model_path = utils.get_full_path(__file__, model)
        if utils.locate_file(model_path) is None:
            problems.append("Failed to locate file %r" % model_path)
            continue

        # Stops probing at the first missing companion file.
        needs_press = any(
            utils.locate_file("%s%s" % (model_path, ext)) is None
            for ext in _binary_extensions)
        if not needs_press:
            continue

        try:
            _out, err, retcode = utils.execute(['hmmpress', model_path])
        except OSError as e:
            retcode, err = 1, str(e)
        if retcode != 0:
            problems.append(
                "Failed to hmmpress %r: %r" % (model_path, err))

    return problems
Beispiel #9
0
    def _runEFICAz(self, chunkDir):
        """Run EFICAz for the chunk in ``chunkDir`` or reuse an old result.

        Changes into ``chunkDir`` for the duration of the call.  If
        ``<basedirName>/<chunk fasta name>.ecpred`` does not exist, the
        EFICAz executable is located and run on the chunk's FASTA file.
        Otherwise the existing result is copied next to the chunk file as
        ``<chunk file>.ecpred``.

        Any failure is logged and ends with ``sys.exit(1)``.  The previous
        working directory is restored before returning or exiting.
        """
        cwd = os.getcwd()
        try:
            os.chdir(chunkDir)
        except OSError:
            logging.exception("Can't chdir to %s" % chunkDir)
            sys.exit(1)

        try:
            fastafile = os.path.basename(self.ChunkFilenames[chunkDir])
            ecpredfile = fastafile + ".ecpred"
            previous_result = os.path.join(self.basedirName, ecpredfile)
            # Only perform calculations if result file does not already exist (from previous run)
            if not os.path.isfile(previous_result):
                EFICAzExecutable = utils.locate_executable(EFICAzBinary)
                if not EFICAzExecutable:
                    # Not inside an exception handler, so use error() rather
                    # than exception().
                    logging.error(
                        "EFICAz executable not found, bailing out, analysis not posible"
                    )
                    sys.exit(1)
                cmdline = [EFICAzExecutable, fastafile]

                logging.debug("executing %s in directory %s" %
                              (" ".join(cmdline), chunkDir))
                try:
                    utils.execute(cmdline)
                except Exception:
                    logging.exception('cannot execute EFICAz!')
                    sys.exit(1)
            else:
                # As this method is executed in an own thread, it does not have the ability to change
                # the variables within the object;
                # As a workaround we just copy the "old" file to the tempdir...
                source = os.path.abspath(previous_result)
                target = self.ChunkFilenames[chunkDir] + ".ecpred"
                try:
                    shutil.copy(source, target)
                except Exception:
                    # Report the actual source path; the handler itself must
                    # not raise while building the message.
                    logging.exception(
                        "Could not copy existing eficaz result file %s to tempfile %s",
                        source, target)
                    sys.exit(1)
        finally:
            # The working directory is process-wide; restore it on every
            # exit path, including sys.exit().
            os.chdir(cwd)
def align_ks_domains(reference_alignment, ks_names, ks_seqs, data_dir):
    """Align query KS domain sequences to the reference KS alignment.

    The queries are written to ``in_seq.fasta`` in the current working
    directory (one ``name`` line followed by one ``seq`` line per entry, the
    names being written verbatim), aligned with muscle, and the result is
    profile-aligned against ``reference_alignment``.  The combined alignment
    is passed through ``reformat`` into ``aligned.fasta``.

    ``data_dir`` is accepted but not used.

    Returns the path of ``aligned.fasta``.  Exits the process with status 1
    when either muscle run returns a non-zero exit status.
    """
    #Set file names and write query domains to temp input file
    workdir = os.getcwd()
    in_temp = os.path.join(workdir, "in_seq.fasta")
    in_temp_aligned = os.path.join(workdir, "in_seq_aligned.fasta")
    out_temp = os.path.join(workdir, "out_seq.fasta")
    alignment_file = os.path.join(workdir, "aligned.fasta")
    with open(in_temp, "w") as tmp_input:
        for name, seq in zip(ks_names, ks_seqs):
            tmp_input.write("%s\n%s\n" % (name, seq))

    #Generate alignment of query sequences
    muscle_cmd = str(MuscleCommandline(input=in_temp, out=in_temp_aligned))
    out, err, retcode = utils.execute(muscle_cmd.split(" "))
    # Treat every non-zero status as failure, not only status 1.
    if retcode != 0:
        logging.error(
            "Alignment of query KS sequences with Muscle failed. Check if Muscle is installed appropriately."
        )
        sys.exit(1)

    #Align the query alignment to the reference alignment using muscle --profile
    muscle_cmd = str(
        MuscleCommandline(profile='True',
                          in1=reference_alignment,
                          in2=in_temp_aligned,
                          out=out_temp))
    out, err, retcode = utils.execute(muscle_cmd.split(" "))
    if retcode != 0:
        logging.error(
            "Alignment of query+reference KS sequences with Muscle failed. Check if Muscle is installed appropriately."
        )
        sys.exit(1)

    # Read through a context manager so the handle is closed before the
    # file is removed below.
    with open(out_temp, 'r') as combined:
        f_temp_input = combined.read()
    reformat(input=f_temp_input, out_filename=alignment_file)

    #Remove temporary files
    # NOTE(review): in_seq_aligned.fasta is left in place, as before.
    for f in [in_temp, out_temp]:
        os.remove(f)

    return alignment_file
Beispiel #11
0
def run_diamond(query, target, tempdir, options):
    """Run ``diamond blastp`` of ``query`` against ``target``.

    Uses ``options.cpus`` threads, at most 10000 target sequences and an
    e-value cutoff of 1e-05.  Matches go to ``matches.daa`` and ``tempdir``
    is handed to diamond as its temporary directory.  Returns whatever
    ``utils.execute`` returns for the command.
    """
    flags = [
        ("--db", target),
        ("--threads", str(options.cpus)),
        ("--query", query),
        ("--compress", "0"),
        ("--max-target-seqs", "10000"),
        ("--evalue", "1e-05"),
        ("--daa", "matches.daa"),
        ("--tmpdir", tempdir),
    ]
    diamond_args = ["diamond", "blastp"]
    for flag, value in flags:
        diamond_args.extend([flag, value])
    return utils.execute(diamond_args)
Beispiel #12
0
    def test_execute_with_input(self):
        "Test utils.execute() with stdin input"
        # The command that the traced subprocess.Popen call should receive.
        command = ['fake', '--with', 'parameters']
        expected_trace = """    Called subprocess.Popen(
        ['fake', '--with', 'parameters'],
        stderr=-1,
        stdin=-1,
        stdout=-1)
    Called proc.communicate(input='fake input')"""

        result = utils.execute(command, input='fake input')
        out, err, retcode = result

        # Expected values come from the test fixture set up elsewhere.
        self.assertEqual('output', out)
        self.assertEqual('error', err)
        self.assertEqual(0, retcode)
        assert_same_trace(self.tt, expected_trace)
Beispiel #13
0
def run_blastp(target_fasta='', blastp_result='', db_dir='', evalue=1e-30):
    """Run blastp unless the result file already exists.

    ``target_fasta`` is searched against the database ``db_dir`` with the
    given ``evalue`` cutoff.  Output is written to ``blastp_result`` in
    format ``10 qseqid sseqid evalue score length pident``.  When
    ``blastp_result`` is already present the search is skipped and a
    warning is logged.  A non-zero blastp exit status is logged as an error;
    no exception is raised.
    """
    BLASTPprogramName = utils.locate_executable('blastp')
    # Execute blast if output file is not present
    if os.path.isfile(blastp_result):
        logging.warning("Found blast file %s, skipping new caluclation",
                        blastp_result)
        return

    args = [
        BLASTPprogramName, '-query', target_fasta, '-out', blastp_result,
        '-db', db_dir, '-evalue',
        str(evalue), '-outfmt',
        "10 qseqid sseqid evalue score length pident"
    ]
    out, err, retcode = utils.execute(args)
    if retcode != 0:
        # A failed search should be visible without debug logging enabled.
        logging.error("out: %r, err: %r, retcode: %s", out, err, retcode)
Beispiel #14
0
def run_nrpspredictor(seq_record, nrpsnames, nrpsseqs, options):
    """Run NRPSPredictor2 on the given NRPS sequences.

    Inside a temporary working directory the sequences are written to
    ``nrpsseqs.fasta``, ``nrpscodepred.run_nrpscodepred`` is run, and the
    NRPSPredictor2 Java SVM is then run on ``input.sig``.  The SVM output
    goes to ``ctg<record_idx>_nrpspredictor2_svm.txt`` in
    ``options.raw_predictions_outputfolder``, and
    ``ctg<record_idx>_nrpspredictor2_codes.txt`` is moved there as well,
    replacing an older copy if present.

    ``seq_record`` is accepted but not used.
    """
    #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor
    with TemporaryDirectory(change=True):
        nrpsseqs_file = "nrpsseqs.fasta"
        NRPSPredictor2_dir = utils.get_full_path(__file__, "NRPSPredictor2")
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)
        #Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
        nrpscodepred.run_nrpscodepred(options)
        #Run NRPSPredictor2 SVM
        datadir = path.join(NRPSPredictor2_dir, 'data')
        libdir = path.join(NRPSPredictor2_dir, 'lib')
        jarfile = path.join(NRPSPredictor2_dir, 'build', 'NRPSpredictor2.jar')
        classpath = [
            jarfile,
            '%s/java-getopt-1.0.13.jar' % libdir,
            '%s/Utilities.jar' % libdir,
            '%s/libsvm.jar' % libdir
        ]
        record_prefix = "ctg" + str(options.record_idx)
        svm_output = path.join(options.raw_predictions_outputfolder,
                               record_prefix + '_nrpspredictor2_svm.txt')
        commands = [
            'java',
            '-Ddatadir=%s' % datadir, '-cp',
            # os.pathsep is ':' on POSIX and ';' on Windows, so it gives the
            # same separators as the old sys.platform switch without leaving
            # the separator undefined on other platform strings.
            os.pathsep.join(classpath),
            'org.roettig.NRPSpredictor2.NRPSpredictor2', '-i', 'input.sig',
            '-r', svm_output,
            '-s', '1', '-b', '1' if options.eukaryotic else '0'
        ]
        _out, err, _retcode = utils.execute(commands)
        if err != '':
            logging.debug('running nrpspredictor2 gave error %r' % err)
        #Copy NRPSPredictor results and move back to original directory
        codes_file = record_prefix + "_nrpspredictor2_codes.txt"
        try:
            os.remove(
                path.join(options.raw_predictions_outputfolder, codes_file))
        except OSError:
            # Best effort: there may be no previous result to replace.
            pass
        shutil.move(codes_file, options.raw_predictions_outputfolder)
Beispiel #15
0
def run_diamond(query, target, tempdir, options):
    """Run ``diamond blastp`` of ``query`` against ``target``.

    Uses ``options.cpus`` threads, at most 10000 target sequences and an
    e-value cutoff of 1e-05.  Results are written to ``input.out`` in
    output format 6, and ``tempdir`` is handed to diamond as its temporary
    directory.  Returns whatever ``utils.execute`` returns for the command.
    """
    diamond_args = ["diamond", "blastp"]
    diamond_args += ["--db", target]
    diamond_args += ["--threads", str(options.cpus)]
    diamond_args += ["--query", query]
    diamond_args += ["--compress", "0"]
    diamond_args += ["--max-target-seqs", "10000"]
    diamond_args += ["--evalue", "1e-05"]
    diamond_args += ["--out", "input.out"]
    # 6 is blast tabular format, just as in blastp
    diamond_args += ["--outfmt", "6"]
    diamond_args += ["--tmpdir", tempdir]
    return utils.execute(diamond_args)
Beispiel #16
0
def hmmsearch(fasta,hmm):
    """Run ``hmmsearch --noali`` and return the first reported score.

    The name used as key is the first entry of ``fastanames(fasta)``.  If
    the output contains ``[No targets detected`` the result is
    ``{name: "0"}``.  Otherwise the rows between the domain annotation
    header and the pipeline statistics summary are parsed (skipping four
    rows at each end), and the third space-separated field of the first
    such row is returned as ``{name: field}``.  With no rows the result is
    an empty dict.

    Raises IndexError when a parsed row has fewer than eight fields.
    """
    query_name = fastanames(fasta)[0]
    output, _err, _retcode = utils.execute(
        ["hmmsearch", "--noali", hmm, fasta])
    output = output.replace("\r", "\n")

    if "[No targets detected" in output:
        scores = [str(0)]
    else:
        begin = output.find('Domain annotation for each sequence:')
        finish = output.find('Internal pipeline statistics summary:')
        scores = []
        for row in output[begin:finish].split('\n')[4:-4]:
            fields = [token for token in row.split(" ") if token != ""]
            # Every field the parser knows about is still read, so rows
            # that are too short fail exactly as before.
            _domain_nr, score, _evalue = fields[0], fields[2], fields[4]
            _start, _end = fields[6], fields[7]
            scores.append(score)

    # All rows share the same name, so only the first score is reported.
    if not scores:
        return {}
    return {query_name: scores[0]}
Beispiel #17
0
def _kr_alignment_columns(aligned_ref, ref_positions):
    """Return the alignment columns of the given ungapped reference positions.

    Positions count the non-gap residues of ``aligned_ref`` from zero.
    """
    columns = []
    residue_index = 0
    for column, char in enumerate(aligned_ref):
        if char == "-":
            continue
        if residue_index in ref_positions:
            columns.append(column)
        residue_index += 1
    return columns


def _kr_activity(signature):
    """Classify a 4-residue activity signature as "active" or "inactive"."""
    if signature[0] == "K" and signature[1] in ("S", "A", "G") \
            and signature[2] == "Y" and signature[3] in ("N", "G"):
        return "active"
    if signature[0] == "E" and signature[1] in ("S", "A", "G") \
            and signature[2] == "H" and signature[3] == "H":
        return "active"
    return "inactive"


def _kr_stereochemistry(signature):
    """Map an 8-residue stereochemistry signature to a type label."""
    has_ldd = signature[0:3] == "LDD"
    if not has_ldd and signature[3] == "W" and signature[4] != "H" \
            and signature[5:] == "YAN":
        return "A1"
    if not has_ldd and signature[3:] == "WHYAN":
        return "A2"
    if has_ldd and signature[5] == "Y" and signature[6] != "P" \
            and signature[7] == "N":
        return "B1"
    if has_ldd and signature[5:] == "YPN":
        return "B2"
    if signature[5] != "Y":
        return "C1"
    if signature[7] != "N":
        return "C2"
    return "?"


def run_kr_analysis(infile2, out_file):
    """Predict activity and stereochemistry of KR domains.

    Each sequence in the FASTA file ``infile2`` is written to
    ``infile.fasta`` and profile-aligned with muscle against
    ``KRdomains_muscle.fasta`` (output ``muscle.fasta``).  The residues that
    line up with fixed positions of the reference sequence form an activity
    signature and a stereochemistry signature, which are then classified.

    One line ``name<TAB>activity<TAB>stereochemistry`` per input sequence is
    written to ``out_file``.  Nothing is returned.
    """
    ##Core script
    #Extract activity and stereochemistry signatures from KR domains
    reference_profile = utils.get_full_path(__file__, "KRdomains_muscle.fasta")
    muscle_file = "muscle.fasta"
    query_fasta = "infile.fasta"
    refsequence = "MAPSI|PKS|CAM00062.1|Erythromycin_synthase_modules_1_and_2|Sacc_KR1"
    positions_act = [110, 134, 147, 151]
    positions_ste = [90, 91, 92, 139, 144, 147, 149, 151]

    sequences = fastadict(infile2)
    names = fastanames(infile2)
    seqs = fastaseqs(names, sequences)

    signatures = []
    for name, seq in zip(names, seqs):
        writefasta([name], [seq], query_fasta)
        #Run muscle and collect sequence positions from file
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", reference_profile,
            "-in2", query_fasta, "-out", "muscle.fasta"
        ])
        muscle_dict = fastadict(muscle_file)
        muscle_seqs = lseqs(muscle_dict)
        muscle_names = lnames(muscle_dict)

        # The reference's gap pattern differs per alignment, so the columns
        # have to be recomputed for every query.
        refseq = muscle_seqs[muscle_names.index(refsequence)]
        columns_act = _kr_alignment_columns(refseq, positions_act)
        columns_ste = _kr_alignment_columns(refseq, positions_ste)

        #Extract positions from query sequence
        query_seq = muscle_seqs[muscle_names.index(name)]
        signatures.append((
            name,
            "".join([query_seq[col] for col in columns_act]),
            "".join([query_seq[col] for col in columns_ste]),
        ))

    # Classify everything before touching the output file, so a failure
    # does not leave a truncated or empty file behind.
    #Check activity
    activities = [_kr_activity(sig_act) for _, sig_act, _ in signatures]
    #Predict stereochemistry
    stereos = [_kr_stereochemistry(sig_ste) for _, _, sig_ste in signatures]

    #Output to file
    with open(out_file, "w") as outfile:
        for (name, _, _), activity, stereo in zip(signatures, activities,
                                                  stereos):
            outfile.write(name + "\t" + activity + "\t" + stereo + "\n")
Beispiel #18
0
def run_glimmer(seq_record, options):
    """Run glimmer3 to annotate prokaryotic sequences.

    Inside a temporary working directory the record is written as FASTA and
    passed through ``long-orfs``, ``extract``, ``build-icm`` and
    ``glimmer3`` (all taken from ``utils.get_genefinding_basedir``).  Every
    prediction in the resulting ``.predict`` file is appended to
    ``seq_record.features`` as a CDS feature.

    If one of the tools reports a problem, an error is logged and the
    function returns without adding features.  Malformed prediction lines
    are logged and skipped.
    """
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        name = seq_record.id
        # Strip leading dashes from the name used for file names.
        while len(name) > 0 and name[0] == '-':
            name = name[1:]
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        longorfs_file = '%s.longorfs' % name
        icm_file = '%s.icm' % name
        result_file = '%s.predict' % name

        # run long-orfs
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        long_orfs = [path.join(basedir, 'long-orfs')]
        long_orfs.extend([
            '-l', '-n', '-t', '1.15', '--trans_table', '11', fasta_file,
            longorfs_file
        ])
        out, err, _ = execute(long_orfs)
        if err.find('ERROR') > -1:
            logging.error("Locating long orfs failed: %r" % err)
            return

        # run extract
        extract = [
            path.join(basedir, 'extract'), '-t', fasta_file, longorfs_file
        ]
        out, err, retcode = execute(extract)
        if out == '':
            logging.error("Failed to extract genes from model, aborting: %r" %
                          err)
            return

        # The extracted sequences are fed to build-icm on stdin.
        build_icm = [path.join(basedir, 'build-icm'), '-r', icm_file]
        out, err, retcode = execute(build_icm, input=out)
        if err != '':
            logging.error("Failed to build gene model: %r" % err)
            return

        # run glimmer3
        glimmer = [path.join(basedir, 'glimmer3')]
        glimmer.extend([
            '-l', '-o', '50', '-g', '90', '-q', '3000', '-t', '30',
            '--trans_table', '11', fasta_file, icm_file, name
        ])

        out, err, retcode = execute(glimmer)
        if err.find('ERROR') > -1:
            logging.error("Failed to run glimmer3: %r" % err)
            return

        with open(result_file, 'r') as predictions:
            for line in predictions:
                # skip header lines
                if line.startswith('>'):
                    continue

                try:
                    orf_id, start, end, strand, score = line.split()
                    start = int(start)
                    end = int(end)
                    int(strand)  # validated only; orientation comes from start/end
                except ValueError:
                    logging.error('Malformatted glimmer output line %r' %
                                  line.rstrip())
                    # Without skipping, the unparsed values would reach the
                    # arithmetic below and raise.
                    continue

                if start > end:
                    bpy_strand = -1
                    start, end = end, start
                else:
                    bpy_strand = 1

                loc = FeatureLocation(start - 1, end, strand=bpy_strand)
                feature = SeqFeature(location=loc,
                                     id=orf_id,
                                     type="CDS",
                                     qualifiers={
                                         'locus_tag': [
                                             'ctg%s_%s' %
                                             (options.record_idx, orf_id)
                                         ],
                                         'note':
                                         ['Glimmer score: %s' % score]
                                     })
                seq_record.features.append(feature)
Beispiel #19
0
def make_blastdb(inputfile, dbname):
    """Build a protein BLAST database named ``dbname`` from ``inputfile``."""
    makeblastdb_args = ["makeblastdb"]
    makeblastdb_args += ["-in", inputfile]
    makeblastdb_args += ["-out", dbname]
    makeblastdb_args += ["-dbtype", "prot"]
    utils.execute(makeblastdb_args)
Beispiel #20
0
def run_sandpuma(seq_record, nrpsnames, nrpsseqs, options):
    """Run SANDPUMA on the set of NRPS sequences from this genome.

    When ``options.dbgsandpuma`` is not empty, previously computed result
    files are copied from that directory into
    ``options.raw_predictions_outputfolder`` and nothing is executed.

    Otherwise the sequences are written to ``input_adomains.fasta`` in a
    temporary working directory, ``predictnrps_nodep_par.sh`` from the
    bundled ``sandpuma`` directory is run with ``options.cpus``, and its five
    result files are moved into the output folder under names prefixed with
    ``ctg<record_idx>``.

    ``seq_record`` is accepted but not used.

    Raises RuntimeError when the script writes anything to stderr.
    """
    prefix = "ctg" + str(options.record_idx)
    # (file written by SANDPUMA, name it is stored under), in processing order
    result_files = [
        ("query.rep", prefix + "_nrpspredictor3_svm.txt"),
        ("ind.res.tsv", prefix + "_ind.res.tsv"),
        ("pid.res.tsv", prefix + "_pid.res.tsv"),
        ("sandpuma.tsv", prefix + "_sandpuma.tsv"),
        ("ens.res.tsv", prefix + "_ens.res.tsv"),
    ]
    output_folder = options.raw_predictions_outputfolder

    # In debug mode, simply copy over previous predictions.
    if options.dbgsandpuma != '':
        for _produced, stored in result_files:
            shutil.copy(path.join(options.dbgsandpuma, stored),
                        path.join(output_folder, stored))
        return

    #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor
    sandpumadir = utils.get_full_path(__file__, "sandpuma")
    with TemporaryDirectory(change=True):
        #Extract A domains from the NRPS sequences and write to FASTA file
        utils.writefasta(nrpsnames, nrpsseqs, "input_adomains.fasta")
        #Run SANDPUMA on the FASTA file
        script = sandpumadir + os.sep + 'predictnrps_nodep_par.sh'
        stderr_text = utils.execute(
            [script, 'input_adomains.fasta', sandpumadir,
             str(options.cpus)])[1]
        if stderr_text != '':
            logging.error('Running SANDPUMA gave an error')
            raise RuntimeError("Sandpuma failed to run: %s" % stderr_text)
        #Copy SANDPUMA (including NRPSPredictor2) results and move back to original directory
        for produced, stored in result_files:
            shutil.move(produced, output_folder + os.sep + stored)
Beispiel #21
0
def run_prodigal(seq_record, options):
    """Run prodigal to annotate prokaryotic sequences.

    The record is written as FASTA inside a temporary working directory and
    prodigal is run with ``sco`` output.  Metagenomic mode (``-p meta``) is
    used when ``options.genefinding`` is ``"prodigal-m"`` or the sequence is
    shorter than 20000 bases.  Every ``>``-prefixed line of the result file
    is parsed as ``index_start_end_strand`` and appended to
    ``seq_record.features`` as a CDS feature.

    The executable is taken from ``options.prodigal.basedir`` when that is
    configured, otherwise it is looked up by name.  If prodigal reports an
    error, it is logged and the function returns without adding features.
    Malformed result lines are logged and skipped.
    """
    # Default first: previously basedir stayed undefined when the prodigal
    # section existed but had no basedir entry.
    basedir = ""
    if "prodigal" in options and "basedir" in options.prodigal:
        basedir = options.prodigal.basedir

    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        name = seq_record.id
        # Strip leading dashes from the name used for file names.
        while len(name) > 0 and name[0] == '-':
            name = name[1:]
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        if options.genefinding == "prodigal-m" or len(seq_record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal)[1]
        if err.find('Error') > -1:
            logging.error("Failed to run prodigal: %r" % err)
            return

        with open(result_file, 'r') as predictions:
            for line in predictions:
                # only lines starting with '>' carry gene coordinates
                if not line.startswith('>'):
                    continue

                try:
                    gene_id, start, end, prodigal_strand = \
                        line[1:].rstrip().split("_")
                    start = int(start)
                    end = int(end)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r' %
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                if start > end:
                    strand = -1
                    start, end = end, start

                loc = FeatureLocation(start - 1, end, strand=strand)
                feature = SeqFeature(location=loc,
                                     id=gene_id,
                                     type="CDS",
                                     qualifiers={
                                         'locus_tag': [
                                             'ctg%s_%s' %
                                             (options.record_idx, gene_id)
                                         ]
                                     })
                seq_record.features.append(feature)
Beispiel #22
0
    def _execute_tool(self, analysisResource, fileName=None, stdin_data=None):
        """Run the external program described by an analysis XML element.

        The command line is assembled from ``./Execute`` in
        ``analysisResource``: the program name, each parameter (with its
        optional prefix), the database file located under
        ``self.options.activeSiteFinderHMMDir``, and then either
        ``fileName`` (with the optional ``inputfile_prefix``) and/or the
        optional ``STDINprefix`` when ``stdin_data`` is given.

        With ``fileName`` set the program reads that file; otherwise
        ``stdin_data`` is passed on standard input.  The output is parsed
        with ``SearchIO.parse`` using the format named in
        ``./Execute/BioPythonParser``.

        Returns the list of parsed results, or an empty list when the
        program cannot be started, exits non-zero, or its output cannot be
        parsed.
        """
        cmdlineList = []

        # Assemble command line list

        # extract program name from XML
        executeObj = analysisResource.find('./Execute')
        cmdlineList.append(executeObj.attrib['program'])

        # Cycle through parameters in XML
        for parameter in analysisResource.findall('./Execute/parameters/parameter'):

            if 'prefix' in parameter.attrib:
                cmdlineList.append(parameter.attrib['prefix'])
            cmdlineList.append(parameter.text)

        # Get database name
        database = analysisResource.find('./Execute/database')
        if 'prefix' in database.attrib:
            cmdlineList.append(database.attrib['prefix'])
        # Add searchpath
        cmdlineList.append(utils.locate_file(path.join(self.options.activeSiteFinderHMMDir, database.text)))

        if fileName:
            # Get (optional) input file prefix (e.g. -query in blast)
            if 'inputfile_prefix' in executeObj.attrib:
                cmdlineList.append(executeObj.attrib['inputfile_prefix'])
            cmdlineList.append(fileName)

        if stdin_data:
            # Get (optional) prefix for stdin (e.g. "-" for hmmpfam / hmmscan
            if 'STDINprefix' in executeObj.attrib:
                cmdlineList.append(executeObj.attrib['STDINprefix'])

        logging.debug("ASF: %s; external program call:\n%s", analysisResource.attrib['name'], " ".join(cmdlineList))

        try:
            if fileName:
                logging.debug("Executing tool with file input")
                out, _, retcode = utils.execute(cmdlineList)
            else:
                logging.debug("Executing tools with STDIN input")
                out, _, retcode = utils.execute(cmdlineList, input=stdin_data)
        except OSError:
            logging.warning('OS error on execution of: %s', " ".join(cmdlineList))
            return []
        if retcode != 0:
            logging.warning('%s returned %s', cmdlineList[0], retcode)
            return []
        # Log the text itself; formatting the stream object would only print
        # its repr.
        logging.debug('External program output: %s', out)
        res_stream = StringIO(out)

        # Get Biopython parser information from XML
        biopython_parser = analysisResource.find('./Execute/BioPythonParser')
        try:
            results = list(SearchIO.parse(res_stream, biopython_parser.text))
        except Exception as e:
            logging.warning('Error parsing results for active site finder analysis: %s ; no hits will be reported', e)
            results = []

        return results
Beispiel #23
0
def run_minowa_cal(infile2, outfile):
    """Score every query CAL domain against the bundled CAL substrate HMMs.

    For each sequence in the FASTA file ``infile2`` this function:

    1. writes the sequence to ``infile.fasta`` and profile-aligns it with
       muscle against ``CAL_domains_muscle.fasta`` (output: ``muscle.fasta``);
    2. walks the aligned reference sequence ``Q54297_CAL1`` and collects the
       alignment columns whose ungapped residue index matches a position
       from ``CALpositions.txt`` (each listed position minus the offset 43);
    3. extracts those columns from the aligned query (gaps become ``X``) and
       writes them to ``hmm_infile.fasta``;
    4. scores that file against each HMM in the ``CAL_HMMs`` directory and
       writes the substrates, in the order returned by
       ``sortdictkeysbyvalues``, to ``outfile``.

    Parameters:
        infile2: path of the FASTA file holding the query domains.
        outfile: path of the text report to (over)write.

    The intermediate files are created in the current working directory.
    """
    reference_alignment = utils.get_full_path(__file__,
                                              "CAL_domains_muscle.fasta")
    muscle_file = "muscle.fasta"
    refsequence = "Q54297_CAL1"
    startpos = 43
    cal_hmms_dir = utils.get_full_path(__file__, 'CAL_HMMs')
    # Order matters: it is the insertion order of the score dictionary that
    # is handed to sortdictkeysbyvalues below.
    substrates = ("Acetyl-CoA", "AHBA", "fatty_acid", "NH2", "shikimic_acid")

    # The context manager guarantees the report is flushed and closed, also
    # when one of the external tools fails halfway through.
    with open(outfile, "w") as out_file:
        query_dict = fastadict(infile2)
        query_names = fastanames(infile2)
        query_seqs = fastaseqs(query_names, query_dict)

        # The positions file does not depend on the query, so read it once.
        positions_path = utils.get_full_path(__file__, "CALpositions.txt")
        with open(positions_path, "r") as handle:
            text = handle.read()
        text = text.replace("\r", "\n").strip().replace(' ', '_')
        positions = set(int(field) - startpos for field in text.split("\t"))

        # Each name is paired with the sequence at the same list index.
        for name, seq in zip(query_names, query_seqs):
            writefasta([name], [seq], "infile.fasta")
            out_file.write("\\\\\n" + name + "\n")

            # Run muscle and read back the profile alignment
            utils.execute(["muscle", "-profile", "-quiet",
                           "-in1", reference_alignment,
                           "-in2", "infile.fasta",
                           "-out", "muscle.fasta"])
            muscle_dict = fastadict(muscle_file)
            muscle_seqs = lseqs(muscle_dict)
            muscle_names = lnames(muscle_dict)

            # Map the wanted reference residues to alignment columns: only
            # non-gap characters advance the residue counter.
            refseq = muscle_seqs[muscle_names.index(refsequence)]
            columns = []
            residue_index = 0
            for column, char in enumerate(refseq):
                if char == "-":
                    continue
                if residue_index in positions:
                    columns.append(column)
                residue_index += 1

            # Extract those columns from the query; a gap is written as X
            query_seq = muscle_seqs[muscle_names.index(name)]
            extracted = "".join(
                "X" if query_seq[column] == "-" else query_seq[column]
                for column in columns)
            writefasta([name], [extracted], "hmm_infile.fasta")

            # Score the extracted residues against every substrate HMM
            scoredict = {}
            for substrate in substrates:
                hmmresults = hmmsearch(
                    "hmm_infile.fasta",
                    path.join(cal_hmms_dir, substrate + ".hmm"))
                scoredict[substrate] = float(hmmscores(hmmresults)[0])

            # Report substrates in the order given by sortdictkeysbyvalues
            out_file.write("Substrate:\tScore:\n")
            for substrate in sortdictkeysbyvalues(scoredict):
                out_file.write(
                    substrate + "\t" + str(scoredict[substrate]) + "\n")
Beispiel #24
0
def run_minowa_a(infile2, outfile):
    """Score every query A domain against the bundled substrate HMMs.

    For each sequence in the FASTA file ``infile2`` this function:

    1. writes the sequence to ``infile.fasta`` and profile-aligns it with
       muscle against ``A_domains_muscle.fasta`` (output: ``muscle.fasta``);
    2. walks the aligned reference sequence ``P0C062_A1`` and collects the
       alignment columns whose ungapped residue index matches a position
       from ``Apositions.txt`` (each listed position minus the offset 65);
    3. extracts those columns from the aligned query (gaps become ``X``) and
       writes them to ``hmm_infile.fasta``;
    4. scores that file against each HMM in the ``A_HMMs`` directory and
       writes the substrates, in the order returned by
       ``sortdictkeysbyvalues``, to ``outfile``, followed by an empty line.

    Parameters:
        infile2: path of the FASTA file holding the query domains.
        outfile: path of the text report to (over)write.

    The intermediate files are created in the current working directory.
    """
    reference_alignment = utils.get_full_path(__file__,
                                              "A_domains_muscle.fasta")
    muscle_file = "muscle.fasta"
    refsequence = "P0C062_A1"
    startpos = 65
    a_hmms_dir = utils.get_full_path(__file__, 'A_HMMs')
    # Substrate labels in scoring order. The order is the insertion order of
    # the score dictionary that is handed to sortdictkeysbyvalues below.
    substrates = (
        "2-3-diaminoproprionate", "3mGlu", "5NhOrn", "Abu", "Ahp",
        "alaninol", "Asn", "Asp", "beta-Lys", "bOHTyr", "Cys", "DHB",
        "fOHOrn", "Glu", "guanidinoacetic_acid", "His", "Hpg2Cl", "Ile",
        "Leu", "MeAsp", "OHOrn", "Orn", "Phenylacetate", "Pro", "Qna",
        "Sar", "Thr-4-Cl", "Trp", "Val", "3-HPA", "4-MHA", "Aad", "Aeo",
        "Ala", "Arg", "B-Ala", "Bmt", "capreomycidine", "Dab", "DHpg",
        "Gln", "Gly", "hAsn", "homoTyr", "Hpg", "Kyn", "Lys", "mPro",
        "OmAsp", "Phe", "pipecolate", "QA", "Sal", "Ser", "Thr", "Tyr",
    )
    # The HMM file is "<label>.hmm" except for this one label, whose
    # spelling in the report differs from the file name on disk. Both
    # spellings are kept exactly as they were.
    hmm_file_stems = {"2-3-diaminoproprionate": "2-3-diaminopropionate"}

    # The context manager guarantees the report is flushed and closed, also
    # when one of the external tools fails halfway through.
    with open(outfile, "w") as out_file:
        query_dict = fastadict(infile2)
        query_names = fastanames(infile2)
        query_seqs = fastaseqs(query_names, query_dict)

        # The positions file does not depend on the query, so read it once.
        positions_path = utils.get_full_path(__file__, "Apositions.txt")
        with open(positions_path, "r") as handle:
            text = handle.read()
        text = text.replace("\r", "\n").strip().replace(' ', '_')
        positions = set(int(field) - startpos for field in text.split("\t"))

        # Each name is paired with the sequence at the same list index.
        for name, seq in zip(query_names, query_seqs):
            writefasta([name], [seq], "infile.fasta")
            out_file.write("\\\\\n" + name + "\n")

            # Run muscle and read back the profile alignment
            utils.execute([
                "muscle", "-profile", "-quiet", "-in1", reference_alignment,
                "-in2", "infile.fasta", "-out", "muscle.fasta"
            ])
            muscle_dict = fastadict(muscle_file)
            muscle_seqs = lseqs(muscle_dict)
            muscle_names = lnames(muscle_dict)

            # Map the wanted reference residues to alignment columns: only
            # non-gap characters advance the residue counter.
            refseq = muscle_seqs[muscle_names.index(refsequence)]
            columns = []
            residue_index = 0
            for column, char in enumerate(refseq):
                if char == "-":
                    continue
                if residue_index in positions:
                    columns.append(column)
                residue_index += 1

            # Extract those columns from the query; a gap is written as X
            query_seq = muscle_seqs[muscle_names.index(name)]
            extracted = "".join(
                "X" if query_seq[column] == "-" else query_seq[column]
                for column in columns)
            writefasta([name], [extracted], "hmm_infile.fasta")

            # Score the extracted residues against every substrate HMM
            scoredict = {}
            for substrate in substrates:
                stem = hmm_file_stems.get(substrate, substrate)
                hmmresults = hmmsearch("hmm_infile.fasta",
                                       path.join(a_hmms_dir, stem + ".hmm"))
                scoredict[substrate] = float(hmmscores(hmmresults)[0])

            # Report substrates in the order given by sortdictkeysbyvalues
            out_file.write("Substrate:\tScore:\n")
            for substrate in sortdictkeysbyvalues(scoredict):
                out_file.write(
                    substrate + "\t" + str(scoredict[substrate]) + "\n")
            out_file.write("\n")
Beispiel #25
0
def run_pkssignature_analysis(infile2, outfile):
    """Extract a signature from each query AT domain and rank known ones.

    For each sequence in the FASTA file ``infile2`` the sequence is written
    to ``infile.fasta`` and profile-aligned with muscle against
    ``AT_domains_muscle.fasta`` (output: ``muscle.fasta``). The alignment
    columns of reference ``P0AAI9_AT1`` whose ungapped residue index matches
    a position from ``ATpositions.txt`` (each listed position minus the
    offset 7) are read from the aligned query; the resulting string is the
    query signature.

    Every query signature is then compared position by position with each
    entry of ``pks_signatures.fasta``. The score is the number of identical
    positions divided by 24, times 100. Of the first ten names returned by
    ``sortdictkeysbyvalues``, those scoring above 50 are written to
    ``outfile`` together with their signature and rounded score.

    Parameters:
        infile2: path of the FASTA file holding the query domains.
        outfile: path of the text report to (over)write.

    The intermediate files are created in the current working directory.
    """
    ##Core script
    #Extract PKS signature from AT domains
    reference_alignment = utils.get_full_path(__file__,
                                              "AT_domains_muscle.fasta")
    muscle_file = "muscle.fasta"
    refsequence = "P0AAI9_AT1"
    startpos = 7
    query_dict = fastadict(infile2)
    querysignames = fastanames(infile2)
    query_seqs = fastaseqs(querysignames, query_dict)

    # The positions file does not depend on the query, so read it once and
    # close the handle right away.
    positions_path = utils.get_full_path(__file__, "ATpositions.txt")
    with open(positions_path, "r") as handle:
        text = handle.read()
    text = text.strip().replace(' ', '_')
    positions = set(int(field) - startpos for field in text.split("\t"))

    querysigseqs = []
    # Each name is paired with the sequence at the same list index.
    for name, seq in zip(querysignames, query_seqs):
        writefasta([name], [seq], "infile.fasta")
        #Run muscle and read back the profile alignment
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", reference_alignment,
            "-in2", "infile.fasta", "-out", "muscle.fasta"
        ])
        muscle_dict = fastadict(muscle_file)
        muscle_seqs = lseqs(muscle_dict)
        muscle_names = lnames(muscle_dict)

        #Map the wanted reference residues to alignment columns; only
        #non-gap characters advance the residue counter
        refseq = muscle_seqs[muscle_names.index(refsequence)]
        columns = []
        residue_index = 0
        for column, char in enumerate(refseq):
            if char == "-":
                continue
            if residue_index in positions:
                columns.append(column)
            residue_index += 1

        #Extract those columns from the query sequence (gaps are kept)
        query_seq = muscle_seqs[muscle_names.index(name)]
        querysigseqs.append("".join(query_seq[column] for column in columns))

    #Load reference PKS signatures
    signatures_path = utils.get_full_path(__file__, "pks_signatures.fasta")
    signaturesdict = fastadict(signatures_path)
    signaturenames = fastanames(signatures_path)
    signatureseqs = fastaseqs(signaturenames, signaturesdict)
    # Name -> signature lookup; setdefault keeps the first entry for a
    # repeated name, which is what list.index() would have returned.
    signature_by_name = {}
    for signame, sigseq in zip(signaturenames, signatureseqs):
        signature_by_name.setdefault(signame, sigseq)

    #Compare PKS signature with database of signatures and write output to
    #txt file; the context manager closes the report even on errors
    with open(outfile, "w") as out_file:
        for name, querysigseq in zip(querysignames, querysigseqs):
            scoredict = {}
            for signame in signaturenames:
                sigseq = signature_by_name[signame]
                matches = sum(1 for idx in range(len(querysigseq))
                              if querysigseq[idx] == sigseq[idx])
                # NOTE(review): the divisor is a fixed 24, presumably the
                # expected signature length - confirm against the data file.
                scoredict[signame] = (float(matches) / 24) * 100

            out_file.write("//\n" + name + "\t" + querysigseq + "\n")
            for signame in sortdictkeysbyvalues(scoredict)[:10]:
                score = scoredict[signame]
                if score > 50:
                    out_file.write(signame + "\t" +
                                   signature_by_name[signame] + "\t" +
                                   "%.0f" % (score) + "\n")
            out_file.write("\n\n")
Beispiel #26
0
def run_glimmerhmm(seq_record, options):
    """Run GlimmerHMM on seq_record and append the predictions as CDS features.

    The record is written as FASTA into a temporary working directory,
    ``glimmerhmm`` is executed with the training folder
    ``train_<options.glimmerhmm_train_folder>`` and the ``-g`` flag, and its
    tab-separated standard output is parsed. Consecutive non-mRNA lines are
    collected as the exons of one gene; a gene is closed on the last line or
    on the line preceding an ``mRNA`` line. Every gene becomes a ``CDS``
    SeqFeature with a ``locus_tag`` of the form
    ``ctg<options.record_idx>_<orfname>``.

    Parameters:
        seq_record: record to annotate; its ``features`` list is extended.
        options: object providing ``glimmerhmm_train_folder`` and
            ``record_idx`` (and whatever the ``utils`` helpers read).

    Returns:
        None. The function returns early, without adding features, when the
        tool's stderr contains ``ERROR``.
    """
    # Return value is unused here; the call is kept in case the helper
    # validates or prepares something - TODO confirm and drop if it is pure.
    utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        #Write FASTA file and run GlimmerHMM
        utils.fix_record_name_id(seq_record, options)
        # A leading '-' in the file name would be placed first among the
        # tool's arguments, so strip any leading dashes.
        name = seq_record.id.lstrip('-')
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        glimmerhmm = [
            'glimmerhmm',
            fasta_file,
            utils.get_full_path(__file__,
                                "train_%s" % options.glimmerhmm_train_folder),
            "-g",
        ]
        out, err, retcode = execute(glimmerhmm)
        if err.find('ERROR') > -1:
            logging.error("Failed to run GlimmerHMM: %r" % err)
            return

        #Parse GlimmerHMM predictions
        if "CDS" not in out:
            logging.error("GlimmerHMM gene prediction failed: no genes found.")
        # Drop the first two lines and the final (post-newline) entry.
        lines = out.replace("\r", " ").split("\n")[2:-1]
        last_index = len(lines) - 1

        orfnames = []
        positions = []
        strands = []
        orfnr = 0
        # Exon coordinates of the gene currently being collected
        starts = []
        ends = []
        for x, line in enumerate(lines):
            columns = line.split("\t")
            if len(columns) <= 1:
                continue
            if x == 0:
                # The very first line can only open a gene, never close one
                if "mRNA" not in line:
                    starts.append(int(columns[3]))
                    ends.append(int(columns[4]))
            elif x == last_index or "mRNA" in lines[x + 1]:
                # Last line of the current gene: record it and flush
                bpy_strand = 1 if columns[6] == "+" else -1
                strands.append(bpy_strand)
                starts.append(int(columns[3]))
                ends.append(int(columns[4]))
                # NOTE(review): the padding is (5 - orfnr) zeros, i.e. it
                # depends on the counter's value rather than on its number
                # of digits. Kept unchanged because locus tags derive from
                # these names.
                orfnames.append("orf" + (5 - orfnr) * "0" + str(orfnr))
                orfnr += 1
                if bpy_strand == -1:
                    starts.reverse()
                    ends.reverse()
                # Pair each start with its own end. The previous lookup via
                # starts.index() failed for a start of 0 and picked the
                # wrong end when two exons shared a start value.
                # A coordinate of 0 is treated as 1; starts are then shifted
                # down by one.
                positions.append([
                    [(1 if start == 0 else start) - 1, 1 if end == 0 else end]
                    for start, end in zip(starts, ends)
                ])
                starts = []
                ends = []
            elif "mRNA" not in line:
                starts.append(int(columns[3]))
                ends.append(int(columns[4]))
        if len(orfnames) == 0:
            logging.error("GlimmerHMM gene prediction failed. Please check the " \
                "format of your input FASTA file.")

        #Create seq_record features for identified genes
        for orfname, bpy_strand, genepositions in zip(orfnames, strands,
                                                      positions):
            if len(genepositions) == 1:
                #For genes with only one CDS
                gstart, gend = genepositions[0]
                loc = FeatureLocation(gstart, gend, strand=bpy_strand)
            else:
                #For genes with multiple exons
                loc = CompoundLocation([
                    FeatureLocation(exonstart, exonend, strand=bpy_strand)
                    for exonstart, exonend in genepositions
                ])
            feature = SeqFeature(
                location=loc,
                id=orfname,
                type="CDS",
                qualifiers={
                    'locus_tag':
                    ['ctg%s_%s' % (options.record_idx, orfname)]
                })
            seq_record.features.append(feature)