# Esempio n. 1  (scraped example header; "Esempio" = "Example")
# 0  (scraped vote/score count)
def GetLocDef(predfile):  #{{{
    """
    Read in LocDef and its corresponding score from the subcons prediction file

    The file is expected to have a tab-separated header row and value row:
    column 1 is LOC_DEF, columns 2.. are per-location scores.
    Returns a tuple (loc_def, loc_def_score); both are None when the file
    is missing, empty, or malformed.
    """
    content = ""
    if os.path.exists(predfile):
        content = myfunc.ReadFile(predfile)

    loc_def = None
    loc_def_score = None
    if content != "":
        lines = content.split("\n")
        if len(lines) >= 2:
            strs0 = [x.strip() for x in lines[0].split("\t")]
            strs1 = [x.strip() for x in lines[1].split("\t")]
            # header and value rows must align and carry at least one score column
            if len(strs0) == len(strs1) and len(strs0) > 2:
                if strs0[1] == "LOC_DEF":
                    loc_def = strs1[1]
                    # map each location name (columns 2..) to its score;
                    # zip() replaces the xrange() index loop (py2/py3 portable)
                    dt_score = dict(zip(strs0[2:], strs1[2:]))
                    if loc_def in dt_score:
                        loc_def_score = dt_score[loc_def]

    return (loc_def, loc_def_score)
# Esempio n. 2  (scraped example header; "Esempio" = "Example")
# 0  (scraped vote/score count)
def GetAverageNewRunTime(finished_seq_file, window=100):  #{{{
    """Get the average running time of the 'newrun' tasks for the last
    `window` sequences.

    Scans the finished-seq index file from the newest entry backwards and
    averages the run time (column 6) of up to `window` records whose source
    field (column 5) equals "newrun".  Returns -1.0 when the file is missing
    or no valid record is found.
    """
    logger = logging.getLogger(__name__)
    avg_newrun_time = -1.0
    if not os.path.exists(finished_seq_file):
        return avg_newrun_time

    # newest entries are appended last, so walk the lines in reverse
    indexmap_content = myfunc.ReadFile(finished_seq_file).split("\n")
    indexmap_content = indexmap_content[::-1]
    cnt = 0
    sum_run_time = 0.0
    for line in indexmap_content:
        strs = line.split("\t")
        if len(strs) >= 7:
            source = strs[4]
            if source == "newrun":
                try:
                    sum_run_time += float(strs[5])
                    cnt += 1
                except ValueError:  # was a bare except; only float() can raise here
                    logger.debug(
                        "bad format in finished_seq_file (%s) with line \"%s\""
                        % (finished_seq_file, line))

            if cnt >= window:
                break

    if cnt > 0:
        avg_newrun_time = sum_run_time / float(cnt)
    return avg_newrun_time
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run a blastp search for `infile` and publish the result under `outpath`.

    Creates the per-job result folder, writes a start tag file, runs
    blastall against the swissprot database and copies the raw output into
    the result folder.  Errors are accumulated in g_params['runjob_err'],
    log lines in g_params['runjob_log'].
    NOTE(review): `email` is accepted for interface compatibility but is
    unused in the visible code.
    """
    blastdb = "/data3/data/blastdb/swissprot"
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    tmp_outfile = "%s/query.result" % (tmpdir)
    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    outfile = "%s/query.result" % (outpath_result)
    tarball = "%s.tar.gz" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    isOK = True
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False

    if isOK:
        g_params['runjob_log'].append("tmpdir = %s" % (tmpdir))
        #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
        datetime = time.strftime("%Y-%m-%d %H:%M:%S")
        rt_msg = myfunc.WriteFile(datetime, starttagfile)
        if rt_msg:
            g_params['runjob_err'].append(rt_msg)

        cmd = [
            blastall, "-i", infile, "-p", "blastp", "-o", tmp_outfile, "-d",
            blastdb
        ]

        g_params['runjob_log'].append(" ".join(cmd))
        try:
            myfunc.check_output(cmd)
        except subprocess.CalledProcessError as e:  # 'as' works on py2.6+ and py3
            g_params['runjob_err'].append(str(e))
            # BUGFIX: previously `suqoutfile` was referenced unbound when the
            # glob matched nothing; only read it when a queue output file exists
            suqoutfilelist = glob.glob("%s/*.sh.*.out" % (tmpdir))
            if len(suqoutfilelist) > 0:
                suqoutfile = suqoutfilelist[0]
                g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))

        if os.path.exists(tmp_outfile):
            cmd = ["cp", "-f", tmp_outfile, outfile]
            try:
                myfunc.check_output(cmd)
            except subprocess.CalledProcessError as e:
                g_params['runjob_err'].append(str(e))
# Esempio n. 4  (scraped example header; "Esempio" = "Example")
# 0  (scraped vote/score count)
def GetRunTimeFromTimeFile(timefile, keyword=""):  # {{{
    """Extract a run time (in seconds) from a ';'-separated time file.

    Each matching line is expected as "label;runtime;...".  When `keyword`
    is empty the first parsable line wins; otherwise the value of the last
    line containing `keyword` wins.  Returns 0.0 when the file is missing
    or no value can be parsed.
    """
    runtime = 0.0
    if os.path.exists(timefile):
        lines = myfunc.ReadFile(timefile).split("\n")
        for line in lines:
            # (keyword != "" and ...) was redundant after the `or`
            if keyword == "" or line.find(keyword) != -1:
                ss2 = line.split(";")
                try:
                    runtime = float(ss2[1])
                    if keyword == "":
                        break
                except (IndexError, ValueError):  # was a bare except
                    runtime = 0.0
    return runtime
# Esempio n. 5  (scraped example header; "Esempio" = "Example")
# 0  (scraped vote/score count)
def main(args, g_params):
    """Entry point of the TOPCONS2_OCTOPUS workflow master script.

    For every sequence in the input FASTA file: build sequence profiles
    with fa2prfs_pfamscan_v2.sh, falling back to hmmscan on the cdd
    database and then to a uniref-based fallback script when no hits are
    found; run the OCTOPUS and SPOCTOPUS predictors on the profiles;
    append the predicted topologies to
    <out_path>/<rootname>.{OCTOPUS,SPOCTOPUS}.topfa and per-sequence
    timing to <out_path>/time.txt.

    NOTE(review): the incoming `args` parameter is immediately shadowed
    by parser.parse_args() below, so the passed-in value is never used.
    Returns 1 when TMPPATH or out_path is not writable; calls sys.exit(1)
    on fatal I/O errors.
    """
    parser = argparse.ArgumentParser(
        description='TOPCONS2_OCTOPUS workflow master script',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''\
Created 2015-05-05, updated 2018-02-16, Peters Christoph and Nanjiang Shu

Examples:
''')
    parser.add_argument(
        'inFile',
        metavar='inFile',
        help='Specify the input amino acid sequence file in FASTA format')
    parser.add_argument('out_path',
                        metavar='out_path',
                        help='Specify the outpath for result')
    parser.add_argument(
        'blastDir',
        metavar='blastDir',
        help='Specify the path for psiblast, which contains bin/blastpgp')
    parser.add_argument(
        'blastDB',
        metavar='blastDB',
        help='Specify the name of the blastdb, including the path')
    parser.add_argument(
        '-tmpdir',
        '--tmpdir',
        metavar='DIR',
        dest='TMPPATH',
        help=
        'Specify the directory where the temporary files will be written to')
    parser.add_argument('-debug',
                        '--debug',
                        action='store_true',
                        default=False,
                        dest='isDEBUG',
                        help='Output debug info')
    parser.add_argument('-RM',
                        '--remove-individual-files',
                        action='store_true',
                        default=False,
                        dest='isRemoveFile',
                        help='Delete result for individual sequences')

    args = parser.parse_args()

    g_params['DEBUG'] = args.isDEBUG
    g_params['REMOVE_IND_FILES'] = args.isRemoveFile
    inFile = os.path.abspath(args.inFile)
    out_path = os.path.abspath(args.out_path)
    blastDir = os.path.abspath(args.blastDir)
    blastDB = os.path.abspath(args.blastDB)
    if args.TMPPATH != None:
        g_params['TMPPATH'] = os.path.abspath(args.TMPPATH)

    # both the tmp dir and the output dir must be writable before any work
    if not os.access(g_params['TMPPATH'], os.W_OK):
        print >> sys.stderr, "Error. TMPPATH '%s' not writable. Exit." % (
            g_params['TMPPATH'])
        return 1
    if not os.access(out_path, os.W_OK):
        print >> sys.stderr, "Error. out_path '%s' not writable. Exit." % (
            out_path)
        return 1

    os.environ['TMPPATH'] = g_params['TMPPATH']

    DEBUG = g_params['DEBUG']
    TMPPATH = g_params['TMPPATH']
    if not os.path.exists(inFile):
        print >> sys.stderr, "inFile %s does not exist. Exit." % (inFile)
        sys.exit(1)
    if not os.path.exists(out_path):
        try:
            os.makedirs(out_path)
        except OSError:
            print >> sys.stderr, "Failed to create out_path %s. Exit." % (
                out_path)
            sys.exit(1)

    if not "BLASTDB" in os.environ:  # this fixed the warning message of unset 'BLASTDB'
        try:
            blastdbpath = os.path.realpath(os.path.dirname(blastDB))
            os.environ['BLASTDB'] = blastdbpath
        except:
            pass

    # Set the working dir to the script location
    my_path = module_locator.module_path()
    os.chdir(my_path)
    inFile_rootname = os.path.basename(os.path.splitext(inFile)[0])

    # Timing remove from final version
    #print "Timing remove from final version"
    timingfile = "%s/%s" % (out_path, "time.txt")
    topfile_OCTOPUS = "%s/%s.OCTOPUS.topfa" % (out_path, inFile_rootname)
    topfile_SPOCTOPUS = "%s/%s.SPOCTOPUS.topfa" % (out_path, inFile_rootname)
    # combined per-method topology fasta files, appended to per sequence below
    fpout_OCTOPUS = open(topfile_OCTOPUS, "w")
    fpout_SPOCTOPUS = open(topfile_SPOCTOPUS, "w")
    with open(timingfile, "w") as timingFileOut:
        with open(inFile, "rU") as seqFile:
            for index, entry in enumerate(list(SeqIO.parse(seqFile, "fasta"))):
                # Timing remove from final version
                #                 print "Timing remove from final version"
                start = time.time()

                #Create folders for tmp data and output
                used_pfam = "pfam"
                tmpDir = tempfile.mkdtemp(prefix="%s/seq_" %
                                          (TMPPATH) + str(index) + "_") + "/"
                os.chmod(tmpDir, 0755)
                tmpDir_pfam = tmpDir
                tmpDir_cdd = ""
                tmpDir_uniref = ""

                # the predictor scripts read the query name list from this file
                protnamefile = "%s/query.fa.txt" % (tmpDir)
                try:
                    fpout = open(protnamefile, "w")
                    print >> fpout, "query"
                    fpout.close()
                except IOError:
                    print >> sys.stderr, "Failed to write to protnamefile %s. "\
                            "Exit."%(protnamefile)
                    sys.exit(1)

                outDir = "%s%s%s/" % (out_path, os.sep, "seq_%d" % (index))
                if os.path.exists(tmpDir) is False:
                    os.mkdir(tmpDir)

                if os.path.exists(outDir) is False:
                    os.mkdir(outDir)


#                 if os.path.exists(outDir + "Topcons/") is False:
#                     os.mkdir(outDir + "Topcons/")

#                 outfile = "%s/%s"%(tmpDir, "query.fa")
                with open(tmpDir + "query.fa", "w") as outFile:
                    outFile.write(">query" + "\n" + str(entry.seq))

                with open(outDir + "seq.fa", "w") as outFile:
                    outFile.write(">query" + "\n" + str(entry.seq))

                startDir = os.getcwd()

                # At the same time the profiles can be created
                cmd = ["./fa2prfs_pfamscan_v2.sh", tmpDir_pfam, blastDir]
                cmdline = " ".join(cmd)
                rmsg = ""
                try:
                    print "cmdline: ", cmdline
                    rmsg = subprocess.check_call(cmd, stderr=subprocess.STDOUT)
                except subprocess.CalledProcessError, e:
                    print "errmsg:", e
                    print "rmsg:", rmsg
                    pass
                # filesize of the hits db decides whether a fallback is needed
                query_seqdbfile = "%s/%s" % (tmpDir_pfam, "query.hits.db")
                filesize = 0
                try:
                    filesize = os.path.getsize(query_seqdbfile)
                except OSError:
                    filesize = -1
                    pass
                if DEBUG:
                    print "After fa2prfs_pfamscan_v2.sh filesize(%s)=%d" % (
                        query_seqdbfile, filesize)

                # In case we do not find a hit, we have to run hmmscan on the cdd database
                if filesize <= 0:
                    tmpDir_cdd = tempfile.mkdtemp(prefix="%s/seq_cdd_" %
                                                  (TMPPATH) + str(index) +
                                                  "_") + "/"
                    os.chmod(tmpDir_cdd, 0755)
                    with open(tmpDir_cdd + "query.fa", "w") as outFile:
                        outFile.write(">query" + "\n" + str(entry.seq))
                    used_pfam = "cdd"
                    cmd = ["./fa2prfs_hmmscan.sh", tmpDir_cdd, blastDir]
                    cmdline = " ".join(cmd)
                    try:
                        print "\ncmdline:", cmdline
                        rmsg = subprocess.check_call(cmd,
                                                     stderr=subprocess.STDOUT)
                    except subprocess.CalledProcessError, e:
                        print "errmsg:", e
                        print "rmsg:", rmsg
                        pass

                    tmpDir = tmpDir_cdd

                    query_seqdbfile = "%s/%s" % (tmpDir_cdd, "query.hits.db")
                    try:
                        filesize = os.path.getsize(query_seqdbfile)
                    except OSError:
                        filesize = -1
                        pass

                    if DEBUG:
                        print "After fa2prfs_hmmscan.sh filesize(%s)=%d" % (
                            query_seqdbfile, filesize)
                # In case we do not find a hit, we have to run the old script
                if filesize <= 0:
                    tmpDir_uniref = tempfile.mkdtemp(prefix="%s/seq_uniref_" %
                                                     (TMPPATH) + str(index) +
                                                     "_") + "/"
                    os.chmod(tmpDir_uniref, 0755)
                    with open(tmpDir_uniref + "query.fa", "w") as outFile:
                        outFile.write(">query" + "\n" + str(entry.seq))
                    used_pfam = "uniref"
                    cmd = [
                        "./fa2prfs_fallback_v2.sh", tmpDir_uniref, blastDir,
                        blastDB
                    ]
                    cmdline = " ".join(cmd)
                    try:
                        print "\ncmdline:", cmdline
                        rmsg = subprocess.check_call(cmd,
                                                     stderr=subprocess.STDOUT)
                    except subprocess.CalledProcessError, e:
                        print e
                        print rmsg
                        pass
                    tmpDir = tmpDir_uniref

                    query_seqdbfile = "%s/%s" % (tmpDir_uniref,
                                                 "query.hits.db")
                    try:
                        filesize = os.path.getsize(query_seqdbfile)
                    except OSError:
                        filesize = -1
                        pass

                    if DEBUG:
                        print "After fa2prfs_fallback_v2.sh filesize(%s)=%d" % (
                            query_seqdbfile, filesize)

                # Once the profile is created start all other predictors

                # SPOCTOPUS and OCTOPUS are launched as parallel subprocesses
                os.chdir(os.path.abspath("../predictors/spoctopus/"))
                outDir_SPOCTOPUS = outDir + os.sep + "SPOCTOPUS"
                if not os.path.exists(outDir_SPOCTOPUS):
                    os.makedirs(outDir_SPOCTOPUS)
                cmd = [
                    "./SPOCTOPUS.sh", protnamefile, tmpDir + "PSSM_PRF_FILES/",
                    tmpDir + "RAW_PRF_FILES/", outDir_SPOCTOPUS, "-N"
                ]  #output also the ANN result for SPOCTOPUS, changed 2016-01-26
                cmdline = " ".join(cmd)
                if DEBUG:
                    print "cmdline:", cmdline
                p_spoctopus = subprocess.Popen(cmd)
                os.chdir(startDir)

                os.chdir(os.path.abspath("../predictors/spoctopus/"))
                outDir_OCTOPUS = outDir + os.sep + "OCTOPUS"
                if not os.path.exists(outDir_OCTOPUS):
                    os.makedirs(outDir_OCTOPUS)
                cmd = [
                    "./OCTOPUS.sh", protnamefile, tmpDir + "PSSM_PRF_FILES/",
                    tmpDir + "RAW_PRF_FILES/", outDir_OCTOPUS, "-N"
                ]  #output also the ANN result for OCTOPUS, changed 2016-01-26
                cmdline = " ".join(cmd)
                if DEBUG:
                    print "cmdline:", cmdline

                p_octopus = subprocess.Popen(cmd)
                os.chdir(startDir)

                p_spoctopus.communicate()  #now wait for OCTOPUS
                p_octopus.communicate()  #now wait for SPOCTOPUS
                count_pred = 2

                end = time.time()
                # count lines in the hits db that are not FASTA headers,
                # recorded in the timing file alongside the elapsed time
                lines = 0
                with open(tmpDir + "query.hits.db") as inFile:
                    for line in inFile:
                        if line.find(">") == -1:
                            lines += 1
                timingFileOut.write(
                    str(entry.id) + ";" + str(end - start) + ";" + used_pfam +
                    ";" + str(lines) + ";" + str(count_pred) + "\n")
                #Remove the tmpFolder

                if not DEBUG:  #debugging
                    if os.path.exists(tmpDir) is True:
                        p = subprocess.call(["rm", "-rf", tmpDir])
                    if os.path.exists(tmpDir_cdd) is True:
                        p = subprocess.call(["rm", "-rf", tmpDir_cdd])
                    if os.path.exists(tmpDir_uniref) is True:
                        p = subprocess.call(["rm", "-rf", tmpDir_uniref])
                    if os.path.exists(tmpDir_pfam) is True:
                        p = subprocess.call(["rm", "-rf", tmpDir_pfam])
                else:
                    print "tmpDir=%s" % (tmpDir)

                p = subprocess.call(["python", "correct_Topo.py", outDir])

                # append the per-method topology (if produced) to the combined files
                topfile = "%s/%s/%s" % (outDir, "OCTOPUS", "query.top")
                if os.path.exists(topfile):
                    top = myfunc.ReadFile(topfile).strip()
                    if top:
                        fpout_OCTOPUS.write(">%s\n" % (entry.description))
                        fpout_OCTOPUS.write("%s\n" % (top))

                topfile = "%s/%s/%s" % (outDir, "SPOCTOPUS", "query.top")
                if os.path.exists(topfile):
                    top = myfunc.ReadFile(topfile).strip()
                    if top:
                        fpout_SPOCTOPUS.write(">%s\n" % (entry.description))
                        fpout_SPOCTOPUS.write("%s\n" % (top))

                if g_params['REMOVE_IND_FILES']:
                    shutil.rmtree(outDir)
# Esempio n. 6  (scraped example header; "Esempio" = "Example")
# 0  (scraped vote/score count)
def WriteTextResultFile(outfile, maplist, runtime_in_sec):#{{{
    """Write the combined plain-text TOPCONS2 result file.

    outfile        -- path of the text result file to create
    maplist        -- list of tab-separated records: subfolder, length,
                      description, sequence
    runtime_in_sec -- total request time printed in the header

    Per-method topologies, delta-G values and the TOPCONS reliability are
    read from <dirname(outfile)>/<subfolder>/... and printed as one section
    per sequence.  NOTE(review): depends on the module-level
    g_params['base_www_url'].  Only IOError is caught (reported to stdout);
    other exceptions propagate.
    """
    try:
        outpath_result = os.path.dirname(outfile)
        methodlist = ['TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS']
        fpout = open(outfile, "w")
        date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print >> fpout, "##############################################################################"
        print >> fpout, "TOPCONS2 result file"
        print >> fpout, "Generated from http://%s at %s"%(g_params['base_www_url'], date)
        print >> fpout, "Total request time: %.1f seconds."%(runtime_in_sec)
        print >> fpout, "##############################################################################"
        cnt = 0
        for line in maplist:
            strs = line.split('\t')
            subfoldername = strs[0]
            length = int(strs[1])
            desp = strs[2]
            seq = strs[3]
            print >> fpout, "Sequence number: %d"%(cnt+1)
            print >> fpout, "Sequence name: %s"%(desp)
            print >> fpout, "Sequence length: %d aa."%(length)
            print >> fpout, "Sequence:\n%s\n\n"%(seq)

            # each method stores its topology in a differently-named subfolder
            for i in xrange(len(methodlist)):
                method = methodlist[i]
                if method == "TOPCONS":
                    topfile = "%s/%s/%s/topcons.top"%(outpath_result, subfoldername, "Topcons")
                elif method == "Philius":
                    topfile = "%s/%s/%s/query.top"%(outpath_result, subfoldername, "philius")
                elif method == "SCAMPI":
                    topfile = "%s/%s/%s/query.top"%(outpath_result, subfoldername, method+"_MSA")
                else:
                    topfile = "%s/%s/%s/query.top"%(outpath_result, subfoldername, method)
                if os.path.exists(topfile):
                    top = myfunc.ReadFile(topfile)
                else:
                    top = ""
                if top == "":
                    top = "***No topology could be produced with this method topfile=%s***"%(topfile)

                print >> fpout, "%s predicted topology:\n%s\n\n"%(method, top)


            # keep only the data lines of dg.txt (those starting with a digit)
            dgfile = "%s/%s/dg.txt"%(outpath_result, subfoldername)
            dg_content = myfunc.ReadFile(dgfile)
            lines = dg_content.split("\n")
            dglines = []
            for line in lines:
                if line and line[0].isdigit():
                    dglines.append(line)
            if len(dglines)>0:
                print >> fpout,  "\nPredicted Delta-G-values (kcal/mol) "\
                        "(left column=sequence position; right column=Delta-G)\n"
                print >> fpout, "\n".join(dglines)

            reliability_file = "%s/%s/Topcons/reliability.txt"%(outpath_result, subfoldername)
            reliability = myfunc.ReadFile(reliability_file)
            if reliability != "":
                print >> fpout, "\nPredicted TOPCONS reliability (left "\
                        "column=sequence position; right column=reliability)\n"
                print >> fpout, reliability
            print >> fpout, "##############################################################################"
            cnt += 1



    except IOError:
        print "Failed to write to file %s"%(outfile)
# Esempio n. 7  (scraped example header; "Esempio" = "Example")
# 0  (scraped vote/score count)
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):#{{{
    """Run the TOPCONS2 workflow for one job.

    Writes the seqid-index map, runs the workflow script (module-level
    `runscript` with `blastdir`/`blastdb`), copies the results from the tmp
    folder to `outpath`, writes start/finish tag files, produces the
    plain-text result file and a zip archive of the result folder.
    Errors are collected in g_params['runjob_err'], log lines in
    g_params['runjob_log'].
    NOTE(review): `email` is unused in the visible code.
    """
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile   = "%s/runjob.start"%(outpath)
    runjob_errfile = "%s/runjob.err"%(outpath)
    runjob_logfile = "%s/runjob.log"%(outpath)
    finishtagfile = "%s/runjob.finish"%(outpath)
    rmsg = ""


    resultpathname = jobid

    outpath_result = "%s/%s"%(outpath, resultpathname)
    tarball = "%s.tar.gz"%(resultpathname)
    zipfile = "%s.zip"%(resultpathname)
    tarball_fullpath = "%s.tar.gz"%(outpath_result)
    zipfile_fullpath = "%s.zip"%(outpath_result)
    # the finish tag is written only when the first sequence's topcons.top exists
    outfile = "%s/%s/Topcons/topcons.top"%(outpath_result, "seq_%d"%(0))
    resultfile_text = "%s/%s"%(outpath_result, "query.result.txt")

    tmp_outpath_result = "%s/%s"%(tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s"%(tmp_outpath_result)
        myfunc.WriteFile(msg+"\n", runjob_errfile, "a")
        isOK = False

    print "isOK =", isOK

    if isOK:
        tmp_mapfile = "%s/seqid_index_map.txt"%(tmp_outpath_result)

        # build "seq_N <tab> length <tab> description [<tab> seq]" records
        # for every sequence in the input fasta file
        maplist = []
        maplist_simple = []
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                for rd in recordList:
                    maplist.append("%s\t%d\t%s\t%s"%("seq_%d"%cnt, len(rd.seq),
                        rd.description, rd.seq))
                    maplist_simple.append("%s\t%d\t%s"%("seq_%d"%cnt, len(rd.seq),
                        rd.description))
                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        myfunc.WriteFile("\n".join(maplist_simple), tmp_mapfile)

        if isOK:
#             g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)

            # run the actual workflow script and time it
            cmd = [runscript, infile,  tmp_outpath_result, blastdir, blastdb ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e)+"\n")
                g_params['runjob_err'].append(rmsg + "\n")
                suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
                if len(suqoutfilelist)>0:
                    suqoutfile = suqoutfilelist[0]
                # NOTE(review): `suqoutfile` is unbound here when the glob
                # matched nothing -- NameError on that path
                g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time

            # publish the tmp results into the job outpath
            if os.path.exists(tmp_outpath_result):
                cmd = ["cp","-rf", tmp_outpath_result, outpath]
                try:
                    subprocess.check_output(cmd)
                except subprocess.CalledProcessError, e:
                    g_params['runjob_err'].append(str(e))

            if len(g_params['runjob_log']) > 0 :
                rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']), runjob_logfile, "a")
                if rt_msg:
                    g_params['runjob_err'].append(rt_msg)

            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            if os.path.exists(outfile):
                rt_msg = myfunc.WriteFile(datetime, finishtagfile)
                if rt_msg:
                    g_params['runjob_err'].append(rt_msg)

# now write the text output to a single file
            WriteTextResultFile(resultfile_text, maplist, runtime_in_sec)

            # now making zip instead (for windows users)
            pwd = os.getcwd()
            os.chdir(outpath)
#             cmd = ["tar", "-czf", tarball, resultpathname]
            cmd = ["zip", "-rq", zipfile, resultpathname]
            try:
                subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e))
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0))
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)

    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    if isOK:
        try:
            open(finished_seq_file, 'w').close()
        except:
            pass
#first getting result from caches
# ==================================

        maplist = []
        maplist_simple = []
        toRunDict = {}
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)

            recordList = hdl.readseq()
            cnt = 0
            origpath = os.getcwd()
            while recordList != None:
                for rd in recordList:
                    isSkip = False
                    # temp outpath for the sequence is always seq_0, and I feed
                    # only one seq a time to the workflow
                    tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                      "seq_%d" % 0)
                    outpath_this_seq = "%s/%s" % (outpath_result,
                                                  "seq_%d" % cnt)
                    subfoldername_this_seq = "seq_%d" % (cnt)
                    if os.path.exists(tmp_outpath_this_seq):
                        try:
                            shutil.rmtree(tmp_outpath_this_seq)
                        except OSError:
                            pass

                    maplist.append(
                        "%s\t%d\t%s\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append(
                        "%s\t%d\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description))
                    if not g_params['isForceRun']:
                        md5_key = hashlib.md5(rd.seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        if os.path.exists(md5_link):
                            # create a symlink to the cache
                            rela_path = os.path.relpath(
                                md5_link, outpath_result)  #relative path
                            os.chdir(outpath_result)
                            os.symlink(rela_path, subfoldername_this_seq)

                            if os.path.exists(outpath_this_seq):
                                runtime = 0.0  #in seconds
                                topfile = "%s/%s/topcons.top" % (
                                    outpath_this_seq, "Topcons")
                                top = myfunc.ReadFile(topfile).strip()
                                numTM = myfunc.CountTM(top)
                                posSP = myfunc.GetSPPosition(top)
                                if len(posSP) > 0:
                                    isHasSP = True
                                else:
                                    isHasSP = False
                                info_finish = [
                                    "seq_%d" % cnt,
                                    str(len(rd.seq)),
                                    str(numTM),
                                    str(isHasSP), "cached",
                                    str(runtime), rd.description
                                ]
                                myfunc.WriteFile("\t".join(info_finish) + "\n",
                                                 finished_seq_file,
                                                 "a",
                                                 isFlush=True)
                                isSkip = True

                    if not isSkip:
                        # first try to delete the outfolder if exists
                        if os.path.exists(outpath_this_seq):
                            try:
                                shutil.rmtree(outpath_this_seq)
                            except OSError:
                                pass
                        origIndex = cnt
                        numTM = 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description
                                                ]  #init value for numTM is 0

                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

        # run scampi single to estimate the number of TM helices and then run
        # the query sequences in the descending order of numTM
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        topfile_scampiseq = "%s/%s" % (tmp_outpath_result,
                                       "query.torun.fa.topo")
        if os.path.exists(torun_all_seqfile):
            # run scampi to estimate the number of TM helices
            cmd = [
                script_scampi, torun_all_seqfile, "-outpath",
                tmp_outpath_result
            ]
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                pass
        if os.path.exists(topfile_scampiseq):
            (idlist_scampi, annolist_scampi,
             toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq)
            for jj in xrange(len(idlist_scampi)):
                numTM = myfunc.CountTM(toplist_scampi[jj])
                try:
                    toRunDict[int(idlist_scampi[jj])][1] = numTM
                except (KeyError, ValueError, TypeError):
                    pass

        sortedlist = sorted(toRunDict.items(),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]

        # submit sequences one by one to the workflow according to orders in
        # sortedlist

        for item in sortedlist:
            #             g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" %
                                              (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" %
                                          (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append(
                    "failed to generate seq index %d" % (origIndex))
                continue

            cmd = [
                runscript, seqfile_this_seq, tmp_outpath_result, blastdir,
                blastdb
            ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
                g_params['runjob_log'].append("workflow:\n" + rmsg + "\n")
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                g_params['runjob_err'].append(rmsg + "\n")
                pass
                #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
                #if len(suqoutfilelist)>0:
                #    suqoutfile = suqoutfilelist[0]
                #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time

            if os.path.exists(tmp_outpath_this_seq):
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                isCmdSuccess = False
                try:
                    subprocess.check_output(cmd)
                    isCmdSuccess = True
                except subprocess.CalledProcessError, e:
                    msg = "Failed to run prediction for sequence No. %d\n" % (
                        origIndex)
                    g_params['runjob_err'].append(msg)
                    g_params['runjob_err'].append(str(e) + "\n")
                    pass
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists(
                        outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        g_params['runjob_err'].append(
                            "Failed to move %s/time.txt" %
                            (tmp_outpath_result) + "\n")
                        pass

                if isCmdSuccess:
                    runtime = runtime_in_sec  #in seconds
                    topfile = "%s/%s/topcons.top" % (outpath_this_seq,
                                                     "Topcons")
                    top = myfunc.ReadFile(topfile).strip()
                    numTM = myfunc.CountTM(top)
                    posSP = myfunc.GetSPPosition(top)
                    if len(posSP) > 0:
                        isHasSP = True
                    else:
                        isHasSP = False
                    info_finish = [
                        "seq_%d" % origIndex,
                        str(len(seq)),
                        str(numTM),
                        str(isHasSP), "newrun",
                        str(runtime), description
                    ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)
                    # now write the text output for this seq

                    info_this_seq = "%s\t%d\t%s\t%s" % (
                        "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq,
                                                      outpath_result,
                                                      [info_this_seq],
                                                      runtime_in_sec,
                                                      g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if g_params['base_www_url'].find("topcons.net") != -1:
                        md5_key = hashlib.md5(seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        if os.path.exists(md5_link):
                            try:
                                os.unlink(md5_link)
                            except:
                                pass
                        subfolder_md5 = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        if not os.path.exists(subfolder_md5):
                            try:
                                os.makedirs(subfolder_md5)
                            except:
                                pass

                        rela_path = os.path.relpath(
                            outpath_this_seq, md5_subfolder)  #relative path
                        try:
                            os.chdir(md5_subfolder)
                            os.symlink(rela_path, md5_key)
                        except:
                            pass
Esempio n. 9
0
def WriteSubconsTextResultFile(
        outfile,
        outpath_result,
        maplist,  #{{{
        runtime_in_sec,
        base_www_url,
        statfile=""):
    """Write the Subcons prediction results as a plain-text report to outfile.

    Parameters:
        outfile         path of the text report to create
        outpath_result  root folder holding the per-sequence result subfolders
        maplist         list of lines "subfolder<TAB>length<TAB>desc<TAB>seq"
        runtime_in_sec  total request time, reported in the file header
        base_www_url    server URL, reported in the file header
        statfile        if non-empty, this file is created/truncated
                        (nothing is written to it by this function)

    IOError while opening/writing is reported on stdout; other exceptions
    (e.g. a malformed maplist line) propagate, as before.
    """
    fpout = None
    fpstat = None
    try:
        fpout = open(outfile, "w")
        if statfile != "":
            # side effect only: create/truncate the stat file
            fpstat = open(statfile, "w")

        date_str = time.strftime(FORMAT_DATETIME)
        fpout.write("##############################################################################\n")
        fpout.write("Subcons result file\n")
        fpout.write("Generated from %s at %s\n" % (base_www_url, date_str))
        fpout.write("Total request time: %.1f seconds.\n" % (runtime_in_sec))
        fpout.write("##############################################################################\n")
        cnt = 0
        for line in maplist:
            strs = line.split('\t')
            subfoldername = strs[0]
            length = int(strs[1])
            desp = strs[2]
            seq = strs[3]
            fpout.write("Sequence number: %d\n" % (cnt + 1))
            fpout.write("Sequence name: %s\n" % (desp))
            fpout.write("Sequence length: %d aa.\n" % (length))
            fpout.write("Sequence:\n%s\n\n\n" % (seq))

            # per-sequence result table produced by the prediction pipeline
            rstfile = "%s/%s/%s/query_0_final.csv" % (outpath_result,
                                                      subfoldername, "plot")

            if os.path.exists(rstfile):
                content = myfunc.ReadFile(rstfile).strip()
                lines = content.split("\n")
                if len(lines) >= 6:
                    header_line = lines[0].split("\t")
                    if header_line[0].strip() == "":
                        # first header cell is empty in the CSV; name it and
                        # tidy the remaining header cells
                        header_line[0] = "Method"
                        header_line = [x.strip() for x in header_line]

                    data_line = []
                    for i in range(1, len(lines)):
                        data_line.append([x.strip() for x in lines[i].split("\t")])

                    # re-format the tab-separated table as aligned plain text
                    content = tabulate.tabulate(data_line, header_line,
                                                'plain')
            else:
                content = ""
            if content == "":
                content = "***No prediction could be produced with this method***"

            fpout.write("Prediction results:\n\n%s\n\n\n" % (content))

            fpout.write("##############################################################################\n")
            cnt += 1

    except IOError:
        print("Failed to write to file %s" % (outfile))
    finally:
        # always release the file handles (the original leaked both)
        for fp in (fpout, fpstat):
            if fp is not None:
                try:
                    fp.close()
                except IOError:
                    pass
Esempio n. 10
0
def WriteProQ3TextResultFile(
        outfile,
        query_para,
        modelFileList,  #{{{
        runtime_in_sec,
        base_www_url,
        proq3opt,
        statfile=""):
    """Write global and local ProQ3/ProQ3D scores for all models to outfile.

    Parameters:
        outfile         path of the text report to create
        query_para      dict of query parameters; keys 'isDeepLearning'
                        (default True -> "proq3d") and 'method_quality'
                        (default 'sscore') select the score-file suffix
        modelFileList   list of model file paths; score files are expected
                        next to each model as "<model>.<m>.<q>.global/.local"
                        with legacy-name fallbacks
        runtime_in_sec  total request time, reported in the file header
        base_www_url    server URL, reported in the file header
        proq3opt        ProQ3 option list, reported in the file header
        statfile        if non-empty, this file is created/truncated
                        (nothing is written to it by this function)

    IOError while opening/writing is reported on stdout.
    """
    fpout = None
    fpstat = None
    try:
        fpout = open(outfile, "w")

        # proq3d (deep learning) is the default scoring flavour
        isDeepLearning = query_para.get('isDeepLearning', True)
        if isDeepLearning:
            m_str = "proq3d"
        else:
            m_str = "proq3"

        method_quality = query_para.get('method_quality', 'sscore')

        if statfile != "":
            # side effect only: create/truncate the stat file
            fpstat = open(statfile, "w")
        numModel = len(modelFileList)

        date_str = time.strftime(FORMAT_DATETIME)
        fpout.write("##############################################################################\n")
        fpout.write("# ProQ3 result file\n")
        fpout.write("# Generated from %s at %s\n" % (base_www_url, date_str))
        fpout.write("# Options for Proq3: %s\n" % (str(proq3opt)))
        fpout.write("# Total request time: %.1f seconds.\n" % (runtime_in_sec))
        fpout.write("# Number of finished models: %d\n" % (numModel))
        fpout.write("##############################################################################\n")
        fpout.write("\n")
        fpout.write("# Global scores\n")
        # header row: score-item names are appended on the first iteration
        fpout.write("# %10s" % ("Model"))

        for i in range(numModel):
            modelfile = modelFileList[i]
            # preferred name first, then two legacy naming fallbacks
            globalscorefile = "%s.%s.%s.global" % (modelfile, m_str,
                                                   method_quality)
            if not os.path.exists(globalscorefile):
                globalscorefile = "%s.proq3.%s.global" % (modelfile,
                                                          method_quality)
                if not os.path.exists(globalscorefile):
                    globalscorefile = "%s.proq3.global" % (modelfile)
            (globalscore, itemList) = ReadProQ3GlobalScore(globalscorefile)
            if i == 0:
                for ss in itemList:
                    fpout.write(" %12s" % (ss))
                fpout.write("\n")

            try:
                if globalscore:
                    fpout.write("%2s %10s" % ("", "model_%d" % (i)))
                    for jj in range(len(itemList)):
                        fpout.write(" %12f" % (globalscore[itemList[jj]]))
                    fpout.write("\n")
                else:
                    fpout.write("%2s %10s\n" % ("", "model_%d" % (i)))
            except Exception:
                # best effort: one malformed score entry must not abort the
                # whole report (original swallowed everything with bare except)
                pass

        fpout.write("\n# Local scores\n")
        for i in range(numModel):
            modelfile = modelFileList[i]
            localscorefile = "%s.%s.%s.local" % (modelfile, m_str,
                                                 method_quality)
            if not os.path.exists(localscorefile):
                localscorefile = "%s.proq3.%s.local" % (modelfile,
                                                        method_quality)
                if not os.path.exists(localscorefile):
                    localscorefile = "%s.proq3.local" % (modelfile)
            fpout.write("\n# Model %d\n" % (i))
            content = myfunc.ReadFile(localscorefile)
            fpout.write(content + "\n")

    except IOError:
        print("Failed to write to file %s" % (outfile))
    finally:
        # always release the file handles (the original leaked both)
        for fp in (fpout, fpstat):
            if fp is not None:
                try:
                    fp.close()
                except IOError:
                    pass
Esempio n. 11
0
def RunJob(modelfile, seqfile, outpath, tmpdir, email, jobid, g_params):  #{{{
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(modelfile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    query_parafile = "%s/query.para.txt" % (outpath)
    query_para = {}
    content = myfunc.ReadFile(query_parafile)
    if content != "":
        query_para = json.loads(content)

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_model_file = "%s/finished_models.txt" % (outpath_result)
    timefile = "%s/time.txt" % (outpath_result)

    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    if os.path.exists(tmp_outpath_result):
        shutil.rmtree(tmp_outpath_result)
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    if os.path.exists(outpath_result):
        shutil.rmtree(outpath_result)
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    if isOK:
        try:
            open(finished_model_file, 'w').close()
        except:
            pass
#first getting result from caches
# cache profiles for sequences, but do not cache predictions for models
        webserver_common.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                              runjob_errfile)
        # ==================================
        numModel = 0
        modelFileList = []
        if seqfile != "":  # if the fasta sequence is supplied, all models should be using this sequence
            subfoldername_profile = "profile_%d" % (0)
            outpath_profile = "%s/%s" % (outpath_result, subfoldername_profile)
            CreateProfile(seqfile, outpath_profile, outpath_result,
                          tmp_outpath_result, timefile, runjob_errfile)

            # run proq3 for models
            modelList = myfunc.ReadPDBModel(modelfile)
            numModel = len(modelList)
            for ii in xrange(len(modelList)):
                model = modelList[ii]
                tmp_model_file = "%s/query_%d.pdb" % (tmp_outpath_result, ii)
                myfunc.WriteFile(model + "\n", tmp_model_file)
                profilename = "%s/%s" % (outpath_profile, "query.fasta")
                subfoldername_this_model = "model_%d" % (ii)
                outpath_this_model = "%s/%s" % (outpath_result,
                                                subfoldername_this_model)

                modelinfo = ScoreModel(query_para, tmp_model_file,
                                       outpath_this_model, profilename,
                                       outpath_result, tmp_outpath_result,
                                       timefile, runjob_errfile)
                myfunc.WriteFile("\t".join(modelinfo) + "\n",
                                 finished_model_file, "a")
                modelFileList.append(
                    "%s/%s" % (outpath_this_model, "query_%d.pdb" % (ii)))

        else:  # no seqfile supplied, sequences are obtained from the model file
            modelList = myfunc.ReadPDBModel(modelfile)
            numModel = len(modelList)
            for ii in xrange(len(modelList)):
                model = modelList[ii]
                tmp_model_file = "%s/query_%d.pdb" % (tmp_outpath_result, ii)
                myfunc.WriteFile(model + "\n", tmp_model_file)
                subfoldername_this_model = "model_%d" % (ii)
                tmp_outpath_this_model = "%s/%s" % (tmp_outpath_result,
                                                    subfoldername_this_model)
                if not os.path.exists(tmp_outpath_this_model):
                    os.makedirs(tmp_outpath_this_model)
                tmp_seqfile = "%s/query.fasta" % (tmp_outpath_this_model)
                cmd = [pdb2aa_script, tmp_model_file]
                g_params['runjob_log'].append(" ".join(cmd))
                try:
                    rmsg = subprocess.check_output(cmd)
                    g_params['runjob_log'].append(
                        "extracting sequence from modelfile:\n" + rmsg + "\n")
                except subprocess.CalledProcessError, e:
                    g_params['runjob_err'].append(str(e) + "\n")
                    g_params['runjob_err'].append(rmsg + "\n")

                if rmsg != "":
                    myfunc.WriteFile(">seq\n" + rmsg.strip(), tmp_seqfile)

                subfoldername_profile = "profile_%d" % (ii)
                outpath_profile = "%s/%s" % (outpath_result,
                                             subfoldername_profile)
                CreateProfile(tmp_seqfile, outpath_profile, outpath_result,
                              tmp_outpath_result, timefile, runjob_errfile)

                outpath_this_model = "%s/%s" % (outpath_result,
                                                subfoldername_this_model)
                profilename = "%s/%s" % (outpath_profile, "query.fasta")
                modelinfo = ScoreModel(query_para, tmp_model_file,
                                       outpath_this_model, profilename,
                                       outpath_result, tmp_outpath_result,
                                       timefile, runjob_errfile)
                myfunc.WriteFile("\t".join(modelinfo) + "\n",
                                 finished_model_file, "a")
                modelFileList.append(
                    "%s/%s" % (outpath_this_model, "query_%d.pdb" % (ii)))

        all_end_time = time.time()
        all_runtime_in_sec = all_end_time - all_begin_time

        if len(g_params['runjob_log']) > 0:
            rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']) + "\n",
                                      runjob_logfile, "a")
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)

        webserver_common.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                              runjob_errfile)
        # now write the text output to a single file
        #statfile = "%s/%s"%(outpath_result, "stat.txt")
        statfile = ""
        dumped_resultfile = "%s/%s" % (outpath_result, "query.proq3.txt")
        proq3opt = GetProQ3Option(query_para)
        webserver_common.WriteProQ3TextResultFile(dumped_resultfile,
                                                  query_para,
                                                  modelFileList,
                                                  all_runtime_in_sec,
                                                  g_params['base_www_url'],
                                                  proq3opt,
                                                  statfile=statfile)

        # now making zip instead (for windows users)
        # note that zip rq will zip the real data for symbolic links
        os.chdir(outpath)
        #             cmd = ["tar", "-czf", tarball, resultpathname]
        cmd = ["zip", "-rq", zipfile, resultpathname]
        try:
            subprocess.check_output(cmd)
        except subprocess.CalledProcessError, e:
            g_params['runjob_err'].append(str(e))
            pass
Esempio n. 12
0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG,
                           isWriteRel):  #{{{
    """Dump TOPCONS2 topology predictions for all sequences in seqfile.

    For every sequence in the FASTA file `seqfile`, the per-method topology
    predictions found under `path_result` are written as a text report to
    `outfile`.  In addition, the consensus (TOPCONS) topology of each finished
    sequence is written in FASTA format to "<outfile>.fa", and sequences
    without a finished prediction are collected in
    "<outfile>.unfinished.fa".

    Parameters:
        seqfile     - FASTA file with the query sequences
        path_result - root folder with prediction results; the layout is
                      selected by g_params['resultPathFormat'] ("md5" keyed
                      subfolders, otherwise sequential "seq_<i>" folders)
        outfile     - path of the text report to create
        isWriteDG   - if True, append predicted Delta-G values per sequence
        isWriteRel  - if True, append TOPCONS reliability scores per sequence

    Returns 0 on success, 1 if any of the three output files could not be
    opened for writing.
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)
    outfile_unfinished_fa = "%s.unfinished.fa" % (outfile)
    numseq = len(seqidlist)

    # Open the three output streams up-front; abort early if any fails.
    fpout = None
    try:
        fpout = open(outfile, "w")
    except IOError:
        print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile)
        return 1

    fpout_fa = None
    try:
        fpout_fa = open(outfile_fa, "w")
    except IOError:
        print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile_fa)
        return 1

    fpout_unfinished_fa = None
    try:
        fpout_unfinished_fa = open(outfile_unfinished_fa, "w")
    except IOError:
        print >> sys.stderr, "Failed to write to file \"%s\"" % (
            outfile_unfinished_fa)
        return 1

    # Prediction methods reported per sequence, in output order.
    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS',
        'Homology'
    ]

    cntUnFinished = 0
    for iseq in xrange(len(seqidlist)):
        seq = seqlist[iseq]
        length = len(seq)
        desp = seqannolist[iseq]
        if g_params['resultPathFormat'] == "md5":
            # Results are keyed by md5 of the sequence; try both the digest
            # of the bare sequence and of the sequence with a trailing
            # newline (two historical hashing conventions).
            md5_key2 = hashlib.md5(seq + "\n").hexdigest()
            md5_key1 = hashlib.md5(seq).hexdigest()
            subdirname = "seq_%d" % (0)
            # NOTE(review): isFound is set but never updated or read below;
            # looks like leftover code — confirm before removing.
            isFound = False
            for md5_key in [md5_key1, md5_key2]:
                # two-level fan-out: <path_result>/<xx>/<yy>/<md5>
                dir1 = md5_key[:2]
                dir2 = md5_key[2:4]
                datapath_this_seq = "%s%s%s%s%s%s%s" % (
                    path_result, os.sep, dir1, os.sep, dir2, os.sep, md5_key)
                subdir = "%s/%s" % (datapath_this_seq, subdirname)
                if os.path.exists(subdir):
                    break
        else:
            # sequential layout: one "seq_<i>" folder per input sequence
            subdirname = "seq_%d" % (iseq)
            subdir = "%s/%s" % (path_result, subdirname)

        if g_params['verbose']:
            print "subdir = %s" % (subdir)

        # A sequence counts as finished when its consensus topology exists.
        rstfile = "%s/Topcons/topcons.top" % (subdir)
        if os.path.exists(rstfile):
            print >> fpout, "Sequence number: %d" % (iseq + 1)
            print >> fpout, "Sequence name: %s" % (desp)
            print >> fpout, "Sequence length: %d aa." % (length)
            print >> fpout, "Sequence:\n%s\n\n" % (seq)
            topo_consensus = ""
            for i in xrange(len(methodlist)):
                method = methodlist[i]
                seqid = ""
                seqanno = ""
                top = ""
                # per-method result file locations differ slightly
                if method == "TOPCONS":
                    topfile = "%s/%s/topcons.top" % (subdir, "Topcons")
                elif method == "Philius":
                    topfile = "%s/%s/query.top" % (subdir, "philius")
                elif method == "SCAMPI":
                    topfile = "%s/%s/query.top" % (subdir, method + "_MSA")
                else:
                    topfile = "%s/%s/query.top" % (subdir, method)
                if os.path.exists(topfile):
                    (seqid, seqanno, top) = myfunc.ReadSingleFasta(topfile)
                else:
                    top = ""
                if top == "":
                    #top = "***No topology could be produced with this method topfile=%s***"%(topfile)
                    top = "***No topology could be produced with this method***"

                if method == "TOPCONS":
                    # remember the consensus topology for the FASTA output
                    topo_consensus = top

                if method == "Homology":
                    # for Homology, show the hit's sequence id when available
                    showtext_homo = method
                    if seqid != "":
                        showtext_homo = seqid
                    print >> fpout, "%s:\n%s\n\n" % (showtext_homo, top)
                else:
                    print >> fpout, "%s predicted topology:\n%s\n\n" % (method,
                                                                        top)

            if isWriteDG:
                # append Delta-G values; only lines starting with a digit are
                # data lines (header/comment lines are filtered out)
                dgfile = "%s/dg.txt" % (subdir)
                dg_content = ""
                if os.path.exists(dgfile):
                    dg_content = myfunc.ReadFile(dgfile)
                lines = dg_content.split("\n")
                dglines = []
                for line in lines:
                    if line and line[0].isdigit():
                        dglines.append(line)
                if len(dglines) > 0:
                    print >> fpout,  "\nPredicted Delta-G-values (kcal/mol) "\
                            "(left column=sequence position; right column=Delta-G)\n"
                    print >> fpout, "\n".join(dglines)

            if isWriteRel:
                # append the TOPCONS per-position reliability scores verbatim
                reliability_file = "%s/Topcons/reliability.txt" % (subdir)
                reliability = ""
                if os.path.exists(reliability_file):
                    reliability = myfunc.ReadFile(reliability_file)
                if reliability != "":
                    print >> fpout, "\nPredicted TOPCONS reliability (left "\
                            "column=sequence position; right column=reliability)\n"
                    print >> fpout, reliability

            print >> fpout, "##############################################################################"

            # write the concensus prediction in FASTA format
            print >> fpout_fa, ">%s" % (desp)
            print >> fpout_fa, topo_consensus

        else:
            # write unfinished
            fpout_unfinished_fa.write(">%s\n%s\n" % (desp, seq))
            cntUnFinished += 1

    # NOTE(review): "> 1" skips the warning when exactly one sequence is
    # unfinished — probably intended as "> 0"; confirm before changing.
    if cntUnFinished > 1:
        print >> sys.stderr, "%s out of %d sequences are with unfinished predictions, please check." % (
            cntUnFinished, numseq)

    # close all output streams, ignoring close-time errors
    for fp in [fpout, fpout_fa, fpout_unfinished_fa]:
        if fp:
            try:
                fp.close()
            except IOError:
                pass

    return 0
def SubmitJobToQueue(
        jobid,
        datapath,
        outpath,
        nummodel,
        nummodel_this_user,
        email,  #{{{
        host_ip,
        base_www_url):
    myfunc.WriteFile("Entering SubmitJobToQueue()\n", g_params['debugfile'],
                     "a")
    modelfile = "%s/query.pdb" % (datapath)
    seqfile = "%s/query.fa" % (datapath)

    if nummodel == -1:
        nummodel = myfunc.ReadFile(modelfile).count("\nENDMDL")
        if nummodel == 0:
            nummodel = 1
    if nummodel_this_user == -1:
        nummodel_this_user = nummodel

    query_parafile = "%s/query.para.txt" % (outpath)

    query_para = {}
    content = myfunc.ReadFile(query_parafile)
    para_str = content
    if content != "":
        query_para = json.loads(content)

    try:
        name_software = query_para['name_software']
    except KeyError:
        name_software = "proq3"

    runjob = "%s %s/run_job.py" % (python_exec, rundir)
    scriptfile = "%s/runjob,%s,%s,%s,%s,%d.sh" % (
        outpath, name_software, jobid, host_ip, email, nummodel)
    code_str_list = []
    code_str_list.append("#!/bin/bash")
    code_str_list.append("source %s/bin/activate" % (virt_env_path))
    cmdline = "%s %s -outpath %s -tmpdir %s -jobid %s " % (
        runjob, modelfile, outpath, datapath, jobid)
    if email != "":
        cmdline += "-email \"%s\" " % (email)
    if os.path.exists(seqfile):
        cmdline += "-fasta \"%s\" " % (seqfile)
    if base_www_url != "":
        cmdline += "-baseurl \"%s\" " % (base_www_url)
    if g_params['isForceRun']:
        cmdline += "-force "
    code_str_list.append(cmdline)

    code = "\n".join(code_str_list)

    msg = "Write scriptfile %s" % (scriptfile)
    myfunc.WriteFile(msg + "\n", g_params['debugfile'], "a")

    myfunc.WriteFile(code, scriptfile)
    os.chmod(scriptfile, 0755)

    myfunc.WriteFile("Getting priority" + "\n", g_params['debugfile'], "a")
    priority = myfunc.GetSuqPriority(nummodel_this_user)

    if email in vip_user_list:
        priority = 999999999.0

    myfunc.WriteFile("priority=%d\n" % (priority), g_params['debugfile'], "a")

    st1 = SubmitSuqJob(suq_basedir, datapath, outpath, priority, scriptfile)

    return st1
Esempio n. 14
0
def RunJob(modelfile, seqfile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run the ProQ3 scoring pipeline for one job and package the results.

    Builds sequence profiles, scores every model found in 'modelfile' with
    ScoreModel(), writes a combined text result file, zips the result
    folder and, when run on the front-end node, emails the user.

    Parameters:
        modelfile -- PDB file which may contain several models
        seqfile   -- optional FASTA file; if empty, the sequence of each
                     model is extracted with pdb2aa_script
        outpath   -- directory receiving the final results and tag files
        tmpdir    -- scratch directory; deleted when the job succeeds
        email     -- address to notify; may be empty
        jobid     -- job identifier; also used as the result folder name
        g_params  -- global parameter dict ('runjob_log', 'runjob_err',
                     'base_www_url', ...)

    Returns 0 on success, 1 if any error message was accumulated.
    """
    all_begin_time = time.time()

    # NOTE(review): rootname appears unused in the rest of this function
    rootname = os.path.basename(os.path.splitext(modelfile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    # optional per-query parameters stored as JSON alongside the job
    query_parafile = "%s/query.para.txt" % (outpath)
    query_para = {}
    content = myfunc.ReadFile(query_parafile)
    if content != "":
        query_para = json.loads(content)

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    # NOTE(review): this local name shadows the stdlib 'zipfile' module name
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_model_file = "%s/finished_models.txt" % (outpath_result)
    timefile = "%s/time.txt" % (outpath_result)

    # recreate the scratch result folder from a clean state
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    if os.path.exists(tmp_outpath_result):
        shutil.rmtree(tmp_outpath_result)
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    # recreate the final result folder from a clean state
    if os.path.exists(outpath_result):
        shutil.rmtree(outpath_result)
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    if isOK:
        # truncate/create the per-model progress file; best effort only
        try:
            open(finished_model_file, 'w').close()
        except:  # NOTE(review): bare except silently hides any failure here
            pass
# first get results from the caches:
# profiles are cached per sequence, but predictions for models are not cached
        webserver_common.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                              runjob_errfile)
        # ==================================
        numModel = 0
        modelFileList = []
        if seqfile != "":  # if the fasta sequence is supplied, all models should be using this sequence
            # one shared profile (profile_0) is built for all models
            subfoldername_profile = "profile_%d" % (0)
            outpath_profile = "%s/%s" % (outpath_result, subfoldername_profile)
            CreateProfile(seqfile, outpath_profile, outpath_result,
                          tmp_outpath_result, timefile, runjob_errfile)

            # run proq3 for models
            modelList = myfunc.ReadPDBModel(modelfile)
            numModel = len(modelList)
            for ii in range(len(modelList)):
                model = modelList[ii]
                tmp_model_file = "%s/query_%d.pdb" % (tmp_outpath_result, ii)
                myfunc.WriteFile(model + "\n", tmp_model_file)
                profilename = "%s/%s" % (outpath_profile, "query.fasta")
                subfoldername_this_model = "model_%d" % (ii)
                outpath_this_model = "%s/%s" % (outpath_result,
                                                subfoldername_this_model)

                modelinfo = ScoreModel(query_para, tmp_model_file,
                                       outpath_this_model, profilename,
                                       outpath_result, tmp_outpath_result,
                                       timefile, runjob_errfile)
                # record each scored model as one tab-separated line
                myfunc.WriteFile("\t".join(modelinfo) + "\n",
                                 finished_model_file, "a")
                modelFileList.append(
                    "%s/%s" % (outpath_this_model, "query_%d.pdb" % (ii)))

        else:  # no seqfile supplied, sequences are obtained from the model file
            modelList = myfunc.ReadPDBModel(modelfile)
            numModel = len(modelList)
            for ii in range(len(modelList)):
                model = modelList[ii]
                tmp_model_file = "%s/query_%d.pdb" % (tmp_outpath_result, ii)
                myfunc.WriteFile(model + "\n", tmp_model_file)
                subfoldername_this_model = "model_%d" % (ii)
                tmp_outpath_this_model = "%s/%s" % (tmp_outpath_result,
                                                    subfoldername_this_model)
                if not os.path.exists(tmp_outpath_this_model):
                    os.makedirs(tmp_outpath_this_model)
                tmp_seqfile = "%s/query.fasta" % (tmp_outpath_this_model)
                # extract the amino-acid sequence from the model coordinates
                cmd = [pdb2aa_script, tmp_model_file]
                g_params['runjob_log'].append(" ".join(cmd))
                try:
                    rmsg = subprocess.check_output(cmd)
                    g_params['runjob_log'].append(
                        "extracting sequence from modelfile:\n" + rmsg + "\n")
                except subprocess.CalledProcessError as e:
                    g_params['runjob_err'].append(str(e) + "\n")
                    g_params['runjob_err'].append(rmsg + "\n")

                # NOTE(review): if check_output raised above, rmsg still holds
                # the previous iteration's output, so a stale sequence may be
                # written here -- confirm whether this is intended
                if rmsg != "":
                    myfunc.WriteFile(">seq\n" + rmsg.strip(), tmp_seqfile)

                # per-model profile, since each model has its own sequence
                subfoldername_profile = "profile_%d" % (ii)
                outpath_profile = "%s/%s" % (outpath_result,
                                             subfoldername_profile)
                CreateProfile(tmp_seqfile, outpath_profile, outpath_result,
                              tmp_outpath_result, timefile, runjob_errfile)

                outpath_this_model = "%s/%s" % (outpath_result,
                                                subfoldername_this_model)
                profilename = "%s/%s" % (outpath_profile, "query.fasta")
                modelinfo = ScoreModel(query_para, tmp_model_file,
                                       outpath_this_model, profilename,
                                       outpath_result, tmp_outpath_result,
                                       timefile, runjob_errfile)
                # record each scored model as one tab-separated line
                myfunc.WriteFile("\t".join(modelinfo) + "\n",
                                 finished_model_file, "a")
                modelFileList.append(
                    "%s/%s" % (outpath_this_model, "query_%d.pdb" % (ii)))

        all_end_time = time.time()
        all_runtime_in_sec = all_end_time - all_begin_time

        # flush the accumulated log lines to the job log file
        if len(g_params['runjob_log']) > 0:
            rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']) + "\n",
                                      runjob_logfile, "a")
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)

        webserver_common.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                              runjob_errfile)
        # now write the text output to a single file
        #statfile = "%s/%s"%(outpath_result, "stat.txt")
        statfile = ""
        dumped_resultfile = "%s/%s" % (outpath_result, "query.proq3.txt")
        proq3opt = GetProQ3Option(query_para)
        webserver_common.WriteProQ3TextResultFile(dumped_resultfile,
                                                  query_para,
                                                  modelFileList,
                                                  all_runtime_in_sec,
                                                  g_params['base_www_url'],
                                                  proq3opt,
                                                  statfile=statfile)

        # now making zip instead (for windows users)
        # note that zip rq will zip the real data for symbolic links
        os.chdir(outpath)
        #             cmd = ["tar", "-czf", tarball, resultpathname]
        cmd = ["zip", "-rq", zipfile, resultpathname]
        try:
            subprocess.check_output(cmd)
        except subprocess.CalledProcessError as e:
            g_params['runjob_err'].append(str(e))
            pass

    # success requires both the finish tag and the zip archive to exist
    isSuccess = False
    if (os.path.exists(finishtagfile) and os.path.exists(zipfile_fullpath)):
        isSuccess = True
        # keep the queue runner's stdout file (*.out), if any, with the results
        flist = glob.glob("%s/*.out" % (tmpdir))
        if len(flist) > 0:
            outfile_runscript = flist[0]
        else:
            outfile_runscript = ""
        if os.path.exists(outfile_runscript):
            shutil.move(outfile_runscript, outpath)
        # delete the tmpdir if succeeded (disable this line to keep tmpdir
        # for debugging)
        shutil.rmtree(tmpdir)
    else:
        isSuccess = False
        failedtagfile = "%s/runjob.failed" % (outpath)
        webserver_common.WriteDateTimeTagFile(failedtagfile, runjob_logfile,
                                              runjob_errfile)

# send the result notification by email;
# do not send mail from the cloud VM, only from the front-end node
    if (webserver_common.IsFrontEndNode(g_params['base_www_url'])
            and myfunc.IsValidEmailAddress(email)):
        from_email = "*****@*****.**"
        to_email = email
        subject = "Your result for ProQ3 JOBID=%s" % (jobid)
        if isSuccess:
            bodytext = """
Your result is ready at %s/pred/result/%s

Thanks for using ProQ3

        """ % (g_params['base_www_url'], jobid)
        else:
            bodytext = """
We are sorry that your job with jobid %s is failed.

Please contact %s if you have any questions.

Attached below is the error message:
%s
            """ % (jobid, contact_email, "\n".join(g_params['runjob_err']))
        g_params['runjob_log'].append("Sendmail %s -> %s, %s" %
                                      (from_email, to_email, subject))  #debug
        rtValue = myfunc.Sendmail(from_email, to_email, subject, bodytext)
        if rtValue != 0:
            g_params['runjob_err'].append(
                "Sendmail to {} failed with status {}".format(
                    to_email, rtValue))

    # any accumulated error message marks the whole job as failed
    if len(g_params['runjob_err']) > 0:
        rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_err']) + "\n",
                                  runjob_errfile, "w")
        return 1
    return 0
Esempio n. 15
0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG,
                           isWriteRel):  #{{{
    """Collect per-sequence TOPCONS2 predictions into one text report.

    For every sequence in 'seqfile', reads the topology predicted by each
    method from path_result/seq_<i>/ and writes a human-readable report to
    'outfile'; the consensus (TOPCONS) topology is also written in FASTA
    format to '<outfile>.fa'.

    Parameters:
        seqfile     -- FASTA file with the query sequences
        path_result -- folder with one seq_<i> subfolder per sequence
        outfile     -- destination text file
        isWriteDG   -- if True, append the predicted Delta-G values
        isWriteRel  -- if True, append the TOPCONS reliability values

    Returns 0 on success, 1 if an output file could not be opened.
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)

    fpout = None
    try:
        fpout = open(outfile, "w")
    except IOError:
        print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile)
        return 1

    fpout_fa = None
    try:
        fpout_fa = open(outfile_fa, "w")
    except IOError:
        print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile_fa)
        return 1

    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS',
        'Homology'
    ]

    for i in xrange(len(seqidlist)):
        subdirname = "seq_%d" % (i)
        subdir = "%s/%s" % (path_result, subdirname)
        seq = seqlist[i]
        length = len(seq)
        desp = seqannolist[i]
        print >> fpout, "Sequence number: %d" % (i + 1)
        print >> fpout, "Sequence name: %s" % (desp)
        print >> fpout, "Sequence length: %d aa." % (length)
        print >> fpout, "Sequence:\n%s\n\n" % (seq)
        topo_consensus = ""
        # fix: use a distinct index for the inner loop; the original reused
        # 'i', shadowing the outer sequence index
        for j in xrange(len(methodlist)):
            method = methodlist[j]
            seqid = ""
            seqanno = ""
            top = ""
            # each method stores its topology under a slightly different path
            if method == "TOPCONS":
                topfile = "%s/%s/topcons.top" % (subdir, "Topcons")
            elif method == "Philius":
                topfile = "%s/%s/query.top" % (subdir, "philius")
            elif method == "SCAMPI":
                topfile = "%s/%s/query.top" % (subdir, method + "_MSA")
            else:
                topfile = "%s/%s/query.top" % (subdir, method)
            if os.path.exists(topfile):
                (seqid, seqanno, top) = myfunc.ReadSingleFasta(topfile)
            else:
                top = ""
            if top == "":
                #top = "***No topology could be produced with this method topfile=%s***"%(topfile)
                top = "***No topology could be produced with this method***"

            if method == "TOPCONS":
                topo_consensus = top

            if method == "Homology":
                # for homology, show the hit id (when present) instead of
                # the method name
                showtext_homo = method
                if seqid != "":
                    showtext_homo = seqid
                print >> fpout, "%s:\n%s\n\n" % (showtext_homo, top)
            else:
                print >> fpout, "%s predicted topology:\n%s\n\n" % (method,
                                                                    top)

        if isWriteDG:
            # keep only data lines (those starting with a digit) from dg.txt
            dgfile = "%s/dg.txt" % (subdir)
            dg_content = ""
            if os.path.exists(dgfile):
                dg_content = myfunc.ReadFile(dgfile)
            lines = dg_content.split("\n")
            dglines = []
            for line in lines:
                if line and line[0].isdigit():
                    dglines.append(line)
            if len(dglines) > 0:
                print >> fpout,  "\nPredicted Delta-G-values (kcal/mol) "\
                        "(left column=sequence position; right column=Delta-G)\n"
                print >> fpout, "\n".join(dglines)

        if isWriteRel:
            reliability_file = "%s/Topcons/reliability.txt" % (subdir)
            reliability = ""
            if os.path.exists(reliability_file):
                reliability = myfunc.ReadFile(reliability_file)
            if reliability != "":
                print >> fpout, "\nPredicted TOPCONS reliability (left "\
                        "column=sequence position; right column=reliability)\n"
                print >> fpout, reliability

        print >> fpout, "##############################################################################"

        # write the consensus prediction in FASTA format
        print >> fpout_fa, ">%s" % (desp)
        print >> fpout_fa, topo_consensus

    if fpout:
        try:
            fpout.close()
        except IOError:
            pass
    if fpout_fa:
        try:
            fpout_fa.close()
        except IOError:
            pass

    return 0