def DrawSeqMSA(seqmsafile, outpath):
    print "Remove gaps from sequence"
    (idList, annotationList, seqList) = myfunc.ReadFasta(seqmsafile)
    rootname = os.path.basename(os.path.splitext(seqmsafile)[0])
    basename = os.path.basename(seqmsafile)
    seqfile = outpath + os.sep + rootname + '.fa'
    fpout = open(seqfile, "w")
    for i in xrange(len(idList)):
        fpout.write(">%s\n" % annotationList[i])
        fpout.write("%s\n" % seqList[i].replace("-", "").replace(".", ""))
    fpout.close()

    print "Predicting topologies..."
    scampi_exe = "%s/mySCAMPI_run.pl" % g_params['newscampiscriptpath']
    scampi_dir = g_params['scampi_dir']
    modhmm_bin = g_params['modhmm_bin']
    cmd = "%s %s --scampipath %s --modhmmpath %s --outpath %s" % (
        scampi_exe, seqfile, scampi_dir, modhmm_bin, outpath)
    os.system(cmd)
    os.system("rm -f %s/*.res" % outpath)

    print "Get topomsa"
    binpath = g_params['binpath']
    topofile = outpath + os.sep + rootname + '.fa.topo'
    topomsafile = outpath + os.sep + rootname + '.topomsa.fa'
    cmd = "%s/matchMSAtopo -msa %s -topo %s -o %s" % (binpath, seqmsafile,
                                                      topofile, topomsafile)
    os.system(cmd)

    print "Draw topomsa"
    cmd = "python %s/drawMSATopo.py %s -text y -outpath %s -aaseq %s" % (
        binpath, topomsafile, outpath, seqfile)
    os.system(cmd)
Example #2
0
def main(g_params):  #{{{
    """Print the TM positions of every topology in a FASTA-style file.

    Command line (read from sys.argv):
      topofile / -i FILE   input topology file (required)
      -o FILE              output file; passed to myfunc.myopen with
                           sys.stdout as the default stream
      -gapless             strip '-' and '.' from each topology first
      -q                   sets g_params['isQuiet'] = True
      -h / --help          print help and return 1
      --                   the next argument is taken as the topofile

    For every record one line "<seqid> <myfunc.GetTMPosition(topo)>" is
    written.

    Returns 0 on success and 1 on a usage or I/O error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    topofile = ""
    outfile = ""
    isGapLess = False

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            topofile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # startswith() also copes with an empty-string argument, which
        # argv[i][0] would turn into an IndexError.
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-i", "--i"]:
                # These options consume the following argument.
                if i + 1 >= numArgv:
                    print >> sys.stderr, \
                        "Error! Option %s requires an argument" % (argv[i])
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    topofile = argv[i + 1]
                i += 2
            elif argv[i] in ["-gapless", "--gapless"]:
                isGapLess = True
                i += 1
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            topofile = argv[i]
            i += 1
    if topofile == "":
        print >> sys.stderr, "topofile not set. exit"
        return 1
    try:
        (idList, annoList, seqList) = myfunc.ReadFasta(topofile)
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        for idx in xrange(len(idList)):
            topo = seqList[idx]
            seqid = idList[idx]
            if isGapLess:
                topo = topo.replace("-", "").replace(".", "")
            posTMList = myfunc.GetTMPosition(topo)
            print >> fpout, seqid, posTMList
        myfunc.myclose(fpout)
    except (IOError, IndexError) as e:
        # Report the failure instead of silently producing partial output.
        print >> sys.stderr, "Failed to process topofile %s: %s" % (
            topofile, e)
        return 1
    return 0
Example #3
0
def WriteSeqAlnHTML(seqAlnFileList, extTopoMSA, outfile):  # {{{
    """Write an HTML report of sequence alignments with TM regions marked.

    For each alignment file the companion topology MSA is looked up as
    ``<alnfile without extension>.<extTopoMSA>``.  Pairs where either file
    is missing are reported on stderr and skipped.  When
    ``g_params['removeUnnecessaryGap']`` is set, both the sequence and the
    topology alignments are passed through ``lcmp.RemoveUnnecessaryGap``.

    Args:
        seqAlnFileList: paths of the sequence alignment files.
        extTopoMSA: extension (without the leading dot) of the topology MSA.
        outfile: path of the HTML file to create.

    Returns:
        0 on success, 1 if ``outfile`` cannot be opened for writing.
    """
    try:
        fpout = open(outfile, "w")
    except IOError:
        print("Failed to write to %s" % (outfile), file=sys.stderr)
        return 1

    # The with-block closes the report even if one of the helpers raises.
    with fpout:
        WriteHTMLHeader(
            'Alignment highlighted by <font color=%s>TM regions</font>' %
            ('red'), fpout)
        print("Processed alignments:")
        for alnfile in seqAlnFileList:
            stem = os.path.splitext(alnfile)[0]
            rootname_alnfile = os.path.basename(stem)
            topomsafile = '.'.join([stem, extTopoMSA])

            isMissing = False
            if not os.path.exists(alnfile):
                sys.stderr.write('alnfile %s does not exist\n' % (alnfile))
                isMissing = True
            if not os.path.exists(topomsafile):
                sys.stderr.write('topomsafile %s does not exist\n' %
                                 (topomsafile))
                isMissing = True
            if isMissing:
                continue

            (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
            (topoIDList, topoAnnoList,
             topoList) = myfunc.ReadFasta(topomsafile)
            if g_params['removeUnnecessaryGap']:
                seqList = lcmp.RemoveUnnecessaryGap(seqList)
                topoList = lcmp.RemoveUnnecessaryGap(topoList)

            # since there is no shrinking, index map is always p->p
            final2seq_idxMapList = [
                {j: j for j in range(len(seqList[i]))}
                for i in range(len(seqIDList))
            ]

            print(('\t' + rootname_alnfile))
            # topoList is intentionally passed twice, as in the original
            # call; the meaning of the two slots is defined by
            # WriteHTMLAlignment2.
            WriteHTMLAlignment2(rootname_alnfile, topoIDList, topoAnnoList,
                                topoList, topoList, seqList,
                                final2seq_idxMapList, fpout)

        WriteHTMLTail(fpout)
    return 0
def MatchTopoPairAln(queryTopoFile,alignFile, targetsTopologyFile, fpout):#{{{
    """Project topologies onto pairwise sequence alignments and print them.

    The query topology is read with myfunc.ReadSingleFasta, the alignments
    with ReadNeedleAlignment and the target topologies with myfunc.ReadFasta.
    Alignment record i is used for target i.  For every alignment column a
    gap in the aligned sequence yields '-' in the topology row; any other
    character consumes the next topology state of that sequence.

    A mismatch between the target ID and the alignment's 'seqid2' is only
    reported on stderr.  Any exception is announced on stderr and re-raised.

    Returns 0.
    """
    try:
        (queryID, queryAnnotation, queryTopology) = myfunc.ReadSingleFasta(queryTopoFile)
        alns = ReadNeedleAlignment(alignFile)
        (targetIDList, targetAnnotationList, targetTopoList) = myfunc.ReadFasta(targetsTopologyFile)

        print >> fpout, "#Number of alignments: %d" % len(targetIDList)

        for i, seqID in enumerate(targetIDList):
            alnseq1 = alns[i]['alnseq1']
            alnseq2 = alns[i]['alnseq2']

            if seqID != alns[i]['seqid2']:
                print >> sys.stderr, "seqID does not match, record %d" %i

            # Collect the columns in lists and join once at the end.
            row1 = []
            row2 = []
            pos1 = 0   # next unread state of the query topology
            pos2 = 0   # next unread state of the target topology
            for col in range(len(alnseq1)):
                isRes1 = alnseq1[col] != '-'
                isRes2 = alnseq2[col] != '-'
                if isRes1:
                    row1.append(queryTopology[pos1])
                    pos1 += 1
                else:
                    row1.append('-')
                if isRes2:
                    row2.append(targetTopoList[i][pos2])
                    pos2 += 1
                else:
                    row2.append('-')
            topoaln1 = "".join(row1)
            topoaln2 = "".join(row2)

            # print the result
            print >> fpout, "#Topology alignment %d" %( i+1)
            print >> fpout, ">%s" % queryAnnotation
            print >> fpout, "%s" % topoaln1
            print >> fpout, ">%s" % targetAnnotationList[i]
            print >> fpout, "%s" % topoaln2
            print >> fpout
    except:
        print >>sys.stderr, "except for the function:%s"%sys._getframe().f_code.co_name
        raise
    return 0
def GetPairTopoAln(pairalnTopoFile):#{{{
    """Read a FASTA file of consecutive record pairs into a dictionary.

    Records 2*i and 2*i+1 form pair i; a trailing unpaired record is
    ignored.  Each pair is stored under the key "<id1>-<id2>" as a dict with
    the keys 'id1', 'id2', 'anno1', 'anno2', 'seq1' and 'seq2'.  If the same
    key occurs more than once, the last pair wins.

    Returns:
        dict mapping "<id1>-<id2>" to the pair dict.
    """
    (idList, annoList, seqList) = myfunc.ReadFasta(pairalnTopoFile)
    # Floor division keeps numPair an integer regardless of the division
    # semantics in effect (Python 3 or "from __future__ import division").
    numPair = len(idList) // 2
    pairTopoAlnDict = {}
    for i in range(numPair):
        first = i * 2
        second = first + 1
        pair = {
            'id1': idList[first],
            'id2': idList[second],
            'anno1': annoList[first],
            'anno2': annoList[second],
            'seq1': seqList[first],
            'seq2': seqList[second],
        }
        key = "%s-%s" % (idList[first], idList[second])
        pairTopoAlnDict[key] = pair
    return pairTopoAlnDict
Example #6
0
def RandFasta(inFile, N, rand_seed, fpout):  #{{{
    """Write a random sample of N records from a FASTA file to ``fpout``.

    The records are read with myfunc.ReadFasta(inFile, BLOCK_SIZE).  The
    random generator is seeded with ``rand_seed`` before sampling; N is
    capped at the number of records read.  Records are written as
    ">annotation" followed by the sequence, in sampling order.

    Returns:
        0 on success, -1 if the file could not be read (ReadFasta returned
        None for the ID list).
    """
    (idList, annotationList, seqList) = myfunc.ReadFasta(inFile, BLOCK_SIZE)
    # Identity test: "== None" would invoke a custom __eq__ if there is one.
    if idList is None:
        print("Failed to read fastafile %s. Exit." % inFile, file=sys.stderr)
        return -1
    random.seed(rand_seed)
    Nseq = len(idList)
    if N > Nseq:
        N = Nseq
    idxArray = list(range(Nseq))
    for idx in random.sample(idxArray, N):
        fpout.write(">%s\n" % annotationList[idx])
        fpout.write("%s\n" % seqList[idx])
    return 0
def action(method, alnfile, outfile):
    """Remove unnecessary gaps from an alignment and write the result.

    Args:
        method: 0 selects lcmp.RemoveUnnecessaryGap_old; any other value
            selects lcmp.RemoveUnnecessaryGap.
        alnfile: alignment file, read with myfunc.ReadFasta.
        outfile: output path; the empty string means sys.stdout.

    Returns:
        0 on success, 1 if an IOError occurs while opening or writing the
        output (the failure is reported through click.echo).
    """
    (seqidList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
    if (method == 0):
        newSeqList = lcmp.RemoveUnnecessaryGap_old(seqList)
    else:
        newSeqList = lcmp.RemoveUnnecessaryGap(seqList)
    try:
        if outfile == "":
            fpout = sys.stdout
        else:
            fpout = open(outfile, "w")
        try:
            for i in range(len(seqidList)):
                fpout.write(">%s\n" % (seqAnnoList[i]))
                fpout.write("%s\n" % (newSeqList[i]))
        finally:
            # Close our own file even when a write fails; never close stdout.
            if fpout and fpout != sys.stdout:
                fpout.close()
        return 0
    except IOError:
        click.echo("Failed to write to file %s" % (outfile))
        return 1
Example #8
0
def AddPairwiseAlignmentFactor(pairlistDict, msapath, msaext, #{{{
        isLocalAlignment):
    cntfamid = 0
    verbose = g_params['verbose']
    for famid in pairlistDict:
        cntfamid += 1
        if verbose >= 2:
            print "Add pairwise alignment factor for %d: %s"%(cntfamid, famid)
        msafile = msapath + os.sep + famid + msaext
        if not os.path.exists(msafile):
            print >> sys.stderr, "msafile %s does not exist. Ignore" % msafile
            continue
        (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
        msaDict = {}
        for i in xrange(len(idList)):
            msaDict[idList[i]] = seqList[i]
        pairlist = pairlistDict[famid]
        #print "pairlist=", pairlist
        for i in xrange(len(pairlist)):
            pair = pairlist[i]
            #print "pair = ", pair
            seq1 = ""
            seq2 = ""
            id1 = pair[0]
            id2 = pair[1]
            if id1 in msaDict and id2 in msaDict:
                seq1 = msaDict[id1] 
                seq2 = msaDict[id2]
                [seq1, seq2] = lcmp.RemoveUnnecessaryGap([seq1, seq2])
                if len(seq1) != len(seq2):
                    print >> sys.stderr, "Bad alignment for %s and %s" %(id1,id2)
                else:
                    alignFactor = lcmp.GetAlignmentFactorFromPairAlignment(
                            seq1,seq2, isLocalAlignment)
                    pair.append(alignFactor)
            else:
                if id1 not in msaDict:
                    print >> sys.stderr, "%s not in msafile %s"%(id1, msafile)
                if id2 not in msaDict:
                    print >> sys.stderr, "%s not in msafile %s"%(id2, msafile)
    return 0
#!/usr/bin/env python
import os, sys, myfunc
from math import ceil

# Script: split a pairwise-alignment FASTA file (records 2*p and 2*p+1 form
# pair p) into `nsplit` files named <outpath>/split_<i>.fa.
# Hard-coded input file.
file_pairalnfile="/data3/wk/MPTopo/pfamAna_refpro/cellular_filter_all/pairwise/withinClan/Pfam-A-full.perTM75_nseq20.nr100.filtered.withinclan.max30000.kalignP.pairaln"

(idList, annoList, seqList) = myfunc.ReadFasta(file_pairalnfile)

numseq = len(idList)

# Output directory, created through the shell if it does not exist.
outpath = "splitted"

os.system("mkdir -p %s"%outpath)

nsplit = 10

# Integer division under Python 2 (this script uses xrange).
numPair = numseq / 2
# Round up so that nsplit files are enough to hold every pair.
pairPerSplit = int(ceil(float(numPair) / nsplit))

# bp: index of the first pair written to the current split file.
# NOTE(review): in the portion of the script visible here bp is never
# advanced, anno2/seq2 are read but never written and fpout is never
# closed -- the listing appears to be cut off; confirm against the full file.
bp = 0
for i in xrange(nsplit):
    outfile=outpath + os.sep + "split_%d" %i + ".fa"
    fpout = open(outfile, "w")
    for p in range(bp, bp + pairPerSplit):
        # the last split may hold fewer than pairPerSplit pairs
        if p < numPair:
            anno1 = annoList[2*p]
            anno2 = annoList[2*p+1]
            seq1 = seqList[2*p]
            seq2 = seqList[2*p+1]
            fpout.write(">%s\n"%anno1)
            fpout.write("%s\n"%seq1)
Example #10
0
def main(g_params):  #{{{
    """Parse the command line and benchmark predicted topologies.

    Options (each value option is read with myfunc.my_getopt_str):
      -o/-outfile FILE       result file (myfunc.myopen, default sys.stdout)
      -owrong FILE           file handed to Benchmark as ``fpout_wrong``
      -realtopo FILE         file with the real topologies (required)
      -seqfile FILE          optional sequence file
      -mode tp|tps           stored in g_params['mode']
      -path_predtopo PATH    stored in g_params['path_predtopo'] (required)
      -basename NAME         stored in g_params['basename'] (required)
      -restrictidlist FILE   restrict the benchmark to the IDs in FILE
      -q, -rmsp, -debug      set g_params['isQuiet'/'isRMSP'/'isDEBUG']

    Positional arguments are not accepted.  When the mode is empty it is
    derived from path_predtopo ("topcons_single" -> "tps", otherwise
    "topcons" -> "tp").

    Returns 1 on any usage or input error; falls off the end (None) after a
    successful run.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    real_topofile = ""
    seqfile = ""
    restrictIDListFile = ""
    outfile_wrong_predtopo = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            # No positional argument is accepted, even after "--".
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # startswith() also copes with an empty-string argument.
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-owrong", "--owrong"]:
                (outfile_wrong_predtopo, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-realtopo", "--realtopo"]:
                (real_topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mode", "--mode"]:
                (g_params['mode'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-path_predtopo", "--path_predtopo"]:
                (g_params['path_predtopo'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-restrictidlist", "--restrictidlist"]:
                (restrictIDListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-rmsp", "--rmsp"]:
                g_params['isRMSP'] = True
                i += 1
            elif argv[i] in ["-debug", "--debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1

    if myfunc.checkfile(g_params['path_predtopo'], "path_predtopo") != 0:
        return 1
    if g_params['basename'] == "":
        print >> sys.stderr, "%s: basename not set. exit" % (argv[0])
        return 1
    if myfunc.checkfile(real_topofile, "real_topofile") != 0:
        return 1

    if restrictIDListFile != "":
        g_params['restrictIDset'] = set(myfunc.ReadIDList(restrictIDListFile))
        g_params['isRestrictIDList'] = True

    if g_params['mode'] == "":
        # "topcons_single" must be tested first: it contains "topcons".
        if g_params['path_predtopo'].find("topcons_single") >= 0:
            g_params['mode'] = "tps"
        elif g_params['path_predtopo'].find("topcons") >= 0:
            g_params['mode'] = "tp"
        else:
            # The path lives in g_params; there is no local of that name.
            print >> sys.stderr, "mode not set, and can not be recognized from path_predtopo=%s" % (
                g_params['path_predtopo'])
            return 1

    if not g_params['mode'] in ["tp", "tps"]:
        print >> sys.stderr, "Unrecognized mode = %s" % (g_params['mode'])
        return 1

    (real_idlist, real_annolist,
     real_topolist) = myfunc.ReadFasta(real_topofile)
    seqDict = {}
    if seqfile != "" and os.path.exists(seqfile):
        (seq_idlist, seq_annolist, seqlist) = myfunc.ReadFasta(seqfile)
        for i in xrange(len(seq_idlist)):
            seqDict[seq_idlist[i]] = seqlist[i]

    if len(real_idlist) <= 0:
        print >> sys.stderr, "Failed to read real_topofile %s" % (
            real_topofile)
        return 1

    real_topodict = {}
    for i in xrange(len(real_idlist)):
        real_topodict[real_idlist[i]] = real_topolist[i]

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout_wrong = myfunc.myopen(outfile_wrong_predtopo, None, "w", False)

    # Partition the real topologies by number of TM helices.
    idSet_single = set()
    idSet_multi = set()
    for seqid in real_topodict:
        numTM = myfunc.CountTM(real_topodict[seqid])
        if numTM == 1:
            idSet_single.add(seqid)
        elif numTM > 1:
            idSet_multi.add(seqid)

    # Only "All_Alpha" is benchmarked at present; the "Single" / "Multi"
    # branch below is kept so those types can be re-enabled in the list.
    for TM_type in ["All_Alpha"]:
        if TM_type == "All_Alpha":
            sub_real_topodict = real_topodict
        else:
            sub_real_topodict = {}
            for seqid in real_topodict:
                topo = real_topodict[seqid]
                numTM = myfunc.CountTM(topo)
                if TM_type == "Single" and numTM == 1:
                    sub_real_topodict[seqid] = topo
                elif TM_type == "Multi" and numTM > 1:
                    sub_real_topodict[seqid] = topo
        Benchmark(sub_real_topodict, idSet_single, idSet_multi, TM_type, fpout,
                  fpout_wrong, seqDict)

    myfunc.myclose(fpout)
    # fpout_wrong is None unless -owrong was given (None is the default
    # stream passed to myopen above).
    if fpout_wrong is not None:
        myfunc.myclose(fpout_wrong)
Example #11
0
def Benchmark(real_topodict, idSet_single, idSet_multi, TM_type, fpout,
              fpout_wrong, seqDict):  #{{{
    """Compare predicted topology files with the real topologies.

    One predicted-topology file per agreement group is read from
    g_params['path_predtopo'] (file names are derived from
    g_params['basename'], the group and the optional ".RMSP" suffix) and,
    when g_params['isRestrictIDList'] is set, filtered to
    g_params['restrictIDset'].  For each group one table row is written to
    ``fpout`` with the counts returned by CountIdenticalTopology (nIDT,
    nINV), the number of predictions, and three percentages relative to the
    number of predictions: nIDT, nINV and the remainder.

    Args:
        real_topodict: dict seqid -> real topology.
        idSet_single, idSet_multi: ID sets; not used by the active code.
        TM_type: label written to the header line and passed on to
            CountIdenticalTopology.
        fpout: stream receiving the table.
        fpout_wrong, seqDict: passed through to CountIdenticalTopology.

    Raises:
        ValueError: if g_params['mode'] is neither "tps" nor "tp".
    """
    mode = g_params['mode']
    if mode == "tps":
        itemlist = ["40", "41", "42", "43", "44", "All"]
    elif mode == "tp":
        itemlist = ["50", "51", "52", "53", "54", "55", "All"]
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("Unsupported mode '%s' (expected 'tps' or 'tp')" %
                         (mode))

    isRestrictIDList = g_params['isRestrictIDList']
    addname = ""
    if g_params['isRMSP']:
        addname = ".RMSP"

    numRealTopo = len(real_topodict)

    if isRestrictIDList:
        numRealTopo = len(g_params['restrictIDset']
                          & set(real_topodict.keys()))

    pred_topodict_list = []
    # Step 1, read in predicted topology
    for item in itemlist:
        pred_topofile = ""
        if item.upper() == "ALL":
            if mode == "tps":
                pred_topofile = "%s/%s.topcons-single_topcons_single%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
            elif mode == "tp":
                pred_topofile = "%s/%s.topcons.result_TOPCONS%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
        else:
            if mode == "tps":
                pred_topofile = "%s/%s_topcons_single.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)
            elif mode == "tp":
                pred_topofile = "%s/%s.topcons.result_TOPCONS.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)

        (pred_idlist, pred_annolist,
         pred_topolist) = myfunc.ReadFasta(pred_topofile)
        if len(pred_idlist) <= 0:
            # Not fatal: the group is reported with zero predictions.
            print >> sys.stderr, "Failed to read pred_topofile %s" % (
                pred_topofile)
        pred_topodict = {}
        for i in xrange(len(pred_idlist)):
            if ((not isRestrictIDList)
                    or pred_idlist[i] in g_params['restrictIDset']):
                pred_topodict[pred_idlist[i]] = pred_topolist[i]
        pred_topodict_list.append(pred_topodict)

    # Step 2, calculate precision of the prediction
    # header line
    fpout.write("#%s\n" % (TM_type))
    fpout.write("#%2s %7s %8s %8s %8s %8s %8s %8s %8s\n" %
                ("No", "Group", "nIDT", "nINV", "nPred", "PPV(%)", "NPV_INV",
                 "NPV_Other", "nAllReal"))
    for i in xrange(len(itemlist)):
        item = itemlist[i]
        pred_topodict = pred_topodict_list[i]
        numPredTopo = len(pred_topodict)

        # NOTE(review): `item` is passed both as the 3rd and the last
        # argument, as in the original call -- confirm this is intended.
        (numIDTtopo,
         numINVtopo) = CountIdenticalTopology(pred_topodict, real_topodict,
                                              item, TM_type, fpout_wrong,
                                              seqDict, item)

        ss = "%-3d %7s %8d %8d %8d %8.1f %8.1f %8.1f %8d" % (
            i, item, numIDTtopo, numINVtopo, numPredTopo,
            myfunc.FloatDivision(numIDTtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numINVtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numPredTopo - numIDTtopo - numINVtopo,
                                 numPredTopo) * 100.0, numRealTopo)
        fpout.write("%s\n" % (ss))
    fpout.write("\n")
def main(g_params):#{{{
    """Mask topologies by signal peptide predictions and write the result.

    Command line (read from sys.argv):
      infile             topology file (positional, required, must exist)
      -sp FILE           signal peptide file (required, must exist)
      -f/-format FMT     "signalp" (default) or "phobius"
      -o FILE            output file (myfunc.myopen, default sys.stdout)
      -q                 sets g_params['isQuiet'] = True
      -h / --help        print help and return 1
      --                 the next argument is taken as infile

    The signal peptide file is read with ReadSignalPeptide, the topologies
    with myfunc.ReadFasta, and the result of MaskTopologyBySignalPeptide is
    written as FASTA records with the original annotations.

    Returns 0 on success and 1 on a usage or input error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    signalp_file = ""
    format_sp = "signalp"

    # Options that consume the following argument.
    valueOptionList = ["-o", "--o", "-sp", "--sp",
            "-f", "--f", "-format", "--format"]

    i = 1
    isNonOptionArg=False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # startswith() also copes with an empty-string argument.
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in valueOptionList:
                if i + 1 >= numArgv:
                    print >> sys.stderr, \
                            "Error! Option %s requires an argument"%(argv[i])
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i+1]
                elif argv[i] in ["-sp", "--sp"]:
                    signalp_file = argv[i+1]
                else:
                    format_sp = argv[i+1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if infile == "" or not os.path.exists(infile):
        print >> sys.stderr, "infile not set or does not exist"
        return 1
    if signalp_file == "" or not os.path.exists(signalp_file):
        print >> sys.stderr, "signalp file not set or does not exist"
        return 1
    if not format_sp in ["signalp", "phobius"]:
        print >> sys.stderr, "format_sp = %s is not supported. Exit." %(
                format_sp)
        # Stop here as the message says; do not go on with a bad format.
        return 1

    signalpDict = ReadSignalPeptide(signalp_file, format_sp)
    (idList, annoList, topoList) = myfunc.ReadFasta(infile)

    newTopoList = MaskTopologyBySignalPeptide(idList, topoList, signalpDict)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    for i in xrange(len(idList)):
        fpout.write(">%s\n"%(annoList[i]))
        fpout.write("%s\n"%(newTopoList[i]))

    myfunc.myclose(fpout)
    return 0
Example #13
0
def ReadSeqDBDict(infile):  #{{{
    """Read a FASTA file into a dict of seqid -> (annotation, sequence).

    The records are read with myfunc.ReadFasta.  When an ID occurs more
    than once, the last record with that ID is kept.
    """
    (idList, annotationList, seqList) = myfunc.ReadFasta(infile)
    seqdbDict = {}
    for idx, seqid in enumerate(idList):
        seqdbDict[seqid] = (annotationList[idx], seqList[idx])
    return seqdbDict
Example #14
0
def main(g_params):  #{{{
    """Add sequences from a sequence database to the records of a file.

    Command line (read from sys.argv):
      infile             annotation file (positional, required, must exist)
      -seqdb FILE        sequence database (required, must exist)
      -outfile FILE      output file (myfunc.myopen, default sys.stdout)
      -q                 sets g_params['isQuiet'] = True
      -h / --help        print help and return 1
      --                 the next argument is taken as infile

    For every record of infile whose ID is found in the dict returned by
    GetSeqDict(seqdbfile), the annotation line, the database sequence and,
    if not empty, the record's original content are written.  IDs missing
    from the database are reported on stderr and skipped.

    Returns 0 on success and 1 on a usage or input error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    seqdbfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # startswith() also copes with an empty-string argument.
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outfile", "--outfile", "-seqdb", "--seqdb"]:
                # These options consume the following argument.
                if i + 1 >= numArgv:
                    print >> sys.stderr, \
                        "Error! Option %s requires an argument" % (argv[i])
                    return 1
                if argv[i] in ["-outfile", "--outfile"]:
                    outfile = argv[i + 1]
                else:
                    seqdbfile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1
    if infile == "":
        print >> sys.stderr, "annotation file not set"
        return 1
    elif not os.path.exists(infile):
        print >> sys.stderr, "annotation file %s does not exist" % (infile)
        return 1
    if seqdbfile == "":
        print >> sys.stderr, "seqdbfile file not set"
        return 1
    elif not os.path.exists(seqdbfile):
        print >> sys.stderr, "seqdbfile file %s does not exist" % (seqdbfile)
        return 1
    seqDict = GetSeqDict(seqdbfile)
    if seqDict == {}:
        print >> sys.stderr, "Failed to read seqdbfile %s" % (seqdbfile)
        return 1
    (idList, annoList, contentList) = myfunc.ReadFasta(infile)
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    for idx in xrange(len(idList)):
        seqid = idList[idx]
        # Only the lookup is guarded, so an unrelated KeyError cannot be
        # mistaken for a missing sequence.
        try:
            seq = seqDict[seqid]
        except KeyError:
            print >> sys.stderr, "seqid %s not found in seqdb" % (seqid)
            continue
        fpout.write(">%s\n" % (annoList[idx]))
        fpout.write("%s\n" % (seq))
        if contentList[idx] != "":
            fpout.write("%s\n" % (contentList[idx]))

    myfunc.myclose(fpout)
    return 0
Example #15
0
# Validate the two input paths; `fastafile` and `treefile` are defined
# outside this excerpt. Exit with status 1 if either file is missing.
if not os.path.exists(fastafile):
    print("Error! file fastafile (%s) does not exist." % fastafile,
          file=sys.stderr)
    sys.exit(1)
if not os.path.exists(treefile):
    print("Error! file treefile (%s) does not exist." % treefile,
          file=sys.stderr)
    sys.exit(1)

# Load the tree and collect the names of its leaves, both as a list and as
# a set.
t = Tree(treefile)
leaves = t.get_leaves()
leafNameList = [x.name for x in leaves]
leafNameSet = set(leafNameList)
# Read the records of the FASTA file.
(idList, annotationList, seqList) = myfunc.ReadFasta(fastafile)

# write out taxdef
# Output goes to standard output.
fpout = sys.stdout
numSeq = len(idList)

# write settings
dataset_settings = """\
TREE_COLORS
#use this template to define branch colors and styles, colored ranges and label colors/font styles/backgrounds
#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
Example #16
0
# Small test driver: the first command-line argument (TESTMODE) selects
# which library feature to exercise; some modes take a further argument.
progname = os.path.basename(sys.argv[0])
general_usage = """ 
usage: %s TESTMODE options
""" % (sys.argv[0])

numArgv = len(sys.argv)
if numArgv <= 1:
    print(general_usage)
    sys.exit(1)
TESTMODE = sys.argv[1]

g_params = {}

# These modes read sys.argv[2]; fail with a usage message rather than an
# IndexError traceback when that argument is missing.
if TESTMODE in ("getgapposition", "readfasta") and numArgv < 3:
    sys.stderr.write("%s: TESTMODE '%s' requires an argument\n" %
                     (progname, TESTMODE))
    print(general_usage)
    sys.exit(1)

if TESTMODE == "loadpil":
    # Load a TrueType font through PIL; `rundir` is defined outside this
    # excerpt.
    g_params['font_dir'] = "%s/../fonts/truetype/ttf-dejavu/" % (rundir)
    g_params['font_size'] = 16
    fontpath = g_params['font_dir'] + "DejaVuSerif.ttf"
    print(fontpath)
    # NOTE(review): the font is loaded at size 10, not at
    # g_params['font_size'] -- confirm this is intended.
    g_params['fntTMbox_label'] = ImageFont.truetype(fontpath, 10)

if TESTMODE == "getgapposition":
    # The topology string is given directly on the command line.
    topo = sys.argv[2]
    posGAP = myfunc.GetGapPosition(topo)
    print(posGAP)

if TESTMODE == "readfasta":
    seqfile = sys.argv[2]
    (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
    print(idList)
    print(seqList)
Example #17
0
def start_boctopus(infile, blastpath, modHome, hmmfilename, ws_cytosolic, ws_extracellular, ws_lipidfacing, ws_porefacing, \
    fakedbpath, dbpath, blastpgppath, hhsearchpath, hhblitspath, rpath):
    print "boctopus2 will start with ", infile

#     f = open(infile, "r")#{{{ DELETED
#     lines = f.readlines()
#     f.close()
# 
#     pname   = []
#     seqname = []
#     tempseq = ""
#     for line in lines:
#         line = line.strip()
# 
#         if line.startswith(">"):
#             pname.append(line[1:])
#             if len(tempseq) > 0:
#                 seqname.append(tempseq)
#             tempseq = ""
#         else:
#             tempseq += line
# 
#     if len(tempseq) > 0:
#         seqname.append(tempseq)
# 
#     print pname
#     print seqname
# 
#     if len(pname) != len(seqname):
#         print "number of pnames and seqs not the same."
#     else:#}}}

    # rewrite sequence reading part
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(infile)
    if len(seqidlist) <= 0:
        print >> sys.stderr, "No valid sequences read from file '%s'"%(infile)
        return 1

    #for i in range(0, len(pname)):
    for i in xrange(len(seqidlist)):
        seqid = seqidlist[i]
        seq = seqlist[i]
        seqanno = seqannolist[i]
        print "processing ", i , seqanno

        subtmpdir = "%s/seq_%d"%(tmpdir, i)
        if os.path.exists(subtmpdir):
            shutil.rmtree(subtmpdir)
        os.makedirs(subtmpdir)

        singleseqfile = "%s/query.fa"%(subtmpdir)
        myfunc.WriteFile(">%s\n%s\n"%(seqanno, seq), singleseqfile, mode="w", isFlush=True)

        if not os.path.exists(singleseqfile):
            print >> sys.stderr, "Failed to write to singleseqfile %s"%(singleseqfile)
            continue

        command = "python "+ "%s/boctopus_startHMM.py "%(rundir) + singleseqfile + " " + blastpath + " " + modHome + " " + hmmfilename + " " + ws_cytosolic + " " + ws_extracellular + " " + ws_lipidfacing + " " + ws_porefacing + " " + rpath+ " " +fakedbpath+\
" " + dbpath+ " " + blastpgppath+ " " + hhsearchpath + " " + hhblitspath
        print command
        os.system(command)
        outpath_this_seq = "%s/seq_%d"%(outpath, i)
        if not os.path.exists(outpath_this_seq):
            os.makedirs(outpath_this_seq)
        filepair_to_copy = [
                ("%s/query.fa"%subtmpdir, "%s/query.fa"%outpath_this_seq),
                ("%s/output/query_ioIOS.prf.txt_svm_topo.png"%subtmpdir, "%s/query.predict.png"%(outpath_this_seq)),
                ("%s/output/query_topologies.txt"%(subtmpdir), "%s/query_topologies.txt"%outpath_this_seq),
                ("%s/svmoutput/query_ioIOS.prf.txt"%subtmpdir, "%s/profile.txt"%outpath_this_seq),
                ("%s/pssm/query.filtered.pssmvals"%subtmpdir, "%s/pssm.txt"%(outpath_this_seq))

        ]
        for tup in filepair_to_copy:
            shutil.move(tup[0], tup[1])


    return
Example #18
0
def main():  #{{{
    """Scratch driver for ad-hoc experiments and timing comparisons.

    Every section below is guarded by a literal ``if 0:`` / ``if 1:``
    switch, so only the sections set to ``1`` are executed; at present that
    is just the final subprocess demonstration.  Several of the disabled
    sections read their input (a file name, a database name or a loop
    count) from ``sys.argv``.

    The ``#{{{`` / ``#}}}`` comments are fold markers and have no effect on
    execution.
    """
    # Disabled: compare two gapped topology strings with
    # ct.CompareToposGaplesslyNew and print the input strings afterwards.
    if 0:  #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: calling the int, float, string will not change their original value
        # calling the dict, list will change their original value
        print "strTop1:", strTop1
        print "strTop2:", strTop2
#}}}
    # Disabled: call PrintFuncName() and print the path of this file.
    if 0:  #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
#}}}
    # Disabled: read the whole file named by sys.argv[1] with readlines().
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
#}}}
    # Disabled: read the file in fixed-size blocks until read() returns an
    # empty string; the data itself is discarded.
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
#}}}
    # Disabled: read the file line by line with readline(); the lines are
    # discarded.
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
        #}}}
    # Disabled: parse a FASTA file block by block with
    # myfunc.ReadFastaFromBuffer and echo every record to stdout.
    if 0:  #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # A short read is taken as the signal that the end of the
                # file has been reached.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                # Prepend whatever the previous call handed back as
                # unprocessed before parsing the new block.
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            raise
            #}}}
    # Disabled: read a FASTA file with myfunc.ReadFasta_without_id and echo
    # the annotations and sequences to stdout.
    if 0:  #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
            #}}}
    # Disabled: run IsDuplicatedByHHSearch on a hard-coded .hhr file.
    if 0:  #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"

#}}}
    # Disabled: compute the alignment factor of a hard-coded pairwise
    # alignment.  seq1/seq2 are assigned twice, so only the second pair is
    # actually passed to GetAlignmentFactorFromPairAlignment.
    if 0:  #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
#}}}
    # Disabled: open the database named by sys.argv[1] with MyDB, fetch the
    # record "A0FGX9" and print it together with its parsed fields.
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    # Disabled: fetch from a MyDB database (sys.argv[1]) every record whose
    # id is listed, one per line, in the file sys.argv[2], and write the
    # records to stdout.
    if 0:  #{{{
        import my_extractdb
        #miniking my_extractdb.py see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    # Skip the empty strings produced by blank lines.
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
            #             for rd in  cls.GetAllRecord():
            #                 print rd
#                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
#                print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
#}}}
    # Disabled: print every line delivered by ReadLineByBlock; the loop
    # stops when readlines() returns None.
    if 0:  #{{{ #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()

        except IndexError:
            pass
#}}}
    # Disabled: time ReadLineByBlock against plain file.readline() on the
    # same input file and print both wall-clock durations.
    if 0:  #{{{ #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]

            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))

            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))

        except IndexError:
            pass
#}}}
    # Disabled: print a file line by line using readline().
    if 0:  #{{{ #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
#}}}
    # Disabled: call myfunc.GetFirstWord on a fixed string sys.argv[1]
    # times; the commented-out lines are alternative implementations.
    if 0:  #{{{ #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            #            string = "kajsdfasdfsdfjakasjdfka"
            #            string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
#}}}
    # Disabled: time four ways of reading the same FASTA file (Bio.SeqIO,
    # myfunc.ReadFasta, myfunc.ReadFastaFromBuffer, myfunc.ReadFastaByBlock)
    # and print the number of sequences and the duration of each.
    if 0:  #{{{ # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))

            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # A short read is taken as the end-of-file signal.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                cnt += len(recordList)
                #                 for rd in recordList:
                #                     print ">%s"%rd.description
                #                     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
#}}}
    # Disabled: run lcmp.RemoveUnnecessaryGap_old and
    # lcmp.RemoveUnnecessaryGap on the same input, printing the timings to
    # stderr and the resulting sequences to stdout.
    if 0:  #{{{ #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)

            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

        except IndexError:
            pass
#}}}
    # Disabled: read records with myfunc.ReadMPAByBlock and print each one
    # in FASTA form after converting it with myfunc.mpa2seq.
    if 0:  #{{{ #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
#}}}
    # Disabled: same MyDB lookup of record "A0FGX9" as the earlier MyDB
    # section above.
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    # Disabled: run commands through the shell (shell=True), which is what
    # expands the "topo*.py" wildcard in the last call.
    if 0:  #{{{ #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    # ENABLED: run commands from argument lists, without a shell.
    if 1:  #{{{ #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        # No shell is involved here, so "topo*.py" reaches ls as a literal
        # file name instead of being expanded; a non-zero exit status makes
        # check_call raise CalledProcessError, which is reported below.
        try:
            print subprocess.check_call(["ls",
                                         "topo*.py"])  #This will not work
        except subprocess.CalledProcessError, e:
            print "error message:", e
        # Working alternative: expand the wildcard in Python with glob.glob
        # and pass the matches to ls as separate arguments.
        subprocess.call(["ls"] + glob.glob("topo*.py"))
Example #19
0
def WritePairAln(pairlistDict, msapath, msaext, outname):#{{{
    """Write pairwise alignments extracted from per-family MSA files.

    Three files are produced, all prefixed by ``outname``:

    * ``.pairaln``            - both aligned sequences of every pair (FASTA)
    * ``.tableinfo``          - one row of alignment statistics per pair
    * ``.pairlistwithpfamid`` - ``id1 id2 famid`` for every written pair

    Args:
        pairlistDict: maps a family id to a list of pairs.  Each pair is
            indexed as ``(id1, id2, record)`` where ``record`` is a dict
            providing 'seqidt0', 'seqidt1', 'seqidt2', 'alnLength',
            'seqLength1', 'seqLength2', 'numIDT' and 'numGap'.
        msapath: directory holding the MSA files.
        msaext: extension appended to the family id to form the MSA file
            name.
        outname: path prefix of the three output files.

    Returns:
        0 on success, 1 if one of the output files could not be opened.
    """
    verbose = g_params['verbose']
    outAlnFile = outname + ".pairaln"
    outTableFile = outname + ".tableinfo"
    outSelPairList = outname + ".pairlistwithpfamid"

    # Open the outputs in a fixed order.  If one of them fails, close the
    # handles that were already opened before bailing out, so that no file
    # descriptor is leaked on the error path.
    opened = []
    for path in (outAlnFile, outTableFile, outSelPairList):
        try:
            opened.append(open(path, "w"))
        except IOError:
            sys.stderr.write("Failed to write to file %s\n" % path)
            for fp in opened:
                fp.close()
            return 1
    (fpout_aln, fpout_table, fpout_list) = opened

    try:
        fpout_table.write("#%-15s %-15s %6s %6s %9s %6s %6s %9s %6s %6s %6s %6s %6s\n" % (
            "Seq1","Seq2", "IDT0", "SIM0", "AlnLength", "Len1","Len2",
            "Score","N_IDT", "N_SIM", "N_GAP", "IDT1", "IDT2"))

        for famid in pairlistDict:
            if verbose >= 2:
                print("Write pairwise alignment for %s"%(famid))
            msafile = msapath + os.sep + famid + msaext
            if not os.path.exists(msafile):
                sys.stderr.write(
                    "msafile %s does not exist. Ignore\n" % msafile)
                continue
            (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
            msaDict = dict(zip(idList, seqList))
            annoDict = dict(zip(idList, annoList))

            for pair in pairlistDict[famid]:
                id1 = pair[0]
                id2 = pair[1]
                # Pairs with a member missing from the MSA are skipped
                # silently.
                if id1 not in msaDict or id2 not in msaDict:
                    continue
                [seq1, seq2] = lcmp.RemoveUnnecessaryGap(
                    [msaDict[id1], msaDict[id2]])
                if len(seq1) != len(seq2):
                    sys.stderr.write(
                        "Bad alignment for %s and %s\n" % (id1, id2))
                    continue

                rd = pair[2]
                fpout_aln.write(">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n"%(
                    annoDict[id1], id2, rd['seqidt0'], rd['seqidt1']))
                fpout_aln.write("%s\n"%seq1)
                fpout_aln.write(">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n"%(
                    annoDict[id2], id1, rd['seqidt0'], rd['seqidt1']))
                fpout_aln.write("%s\n"%seq2)
                # The SIM0, Score and N_SIM columns have no source in the
                # record and are filled with -1 placeholders.
                fpout_table.write("%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n"% (
                    id1, id2, rd['seqidt0'], -1.0,
                    rd['alnLength'],
                    rd['seqLength1'], rd['seqLength2'],
                    -1.0,
                    rd['numIDT'], -1, rd['numGap'],
                    rd['seqidt1'], rd['seqidt2']))
                fpout_list.write("%s %s %s\n"%(id1, id2, famid))
    finally:
        # Close the outputs even if reading an MSA or formatting a record
        # raised.
        for fp in opened:
            fp.close()

    print("Result output to ")
    print("\t%s"%outAlnFile)
    print("\t%s"%outTableFile)

    return 0
Example #20
0
                PrintHelp();
                sys.exit(0);
            elif sys.argv[i] == "-i" or sys.argv[i] == "--infile":
                inFile=sys.argv[i+1];
                i = i + 2;
            elif sys.argv[i] == "-mintm" or sys.argv[i] == "--mintm":
                MIN_NUMTM=int(sys.argv[i+1]);
                i = i + 2;
            elif sys.argv[i] == "-o" or sys.argv[i] == "--out":
                outFile=sys.argv[i+1];
                i = i + 2;
            else:
                print >> sys.stderr,("Error! Wrong argument:%s" % sys.argv[i]);
                sys.exit(1);
        else:
            inFile=sys.argv[i];
            i+=1;
           

    if inFile == "":
        print >> sys.stderr,"Error! Topology file not set.";
        sys.exit(1);


    try :
        (idListTopo,annotationListTopo, topoList) = myfunc.ReadFasta(inFile);
        CleanSingleSpanTMPro(idListTopo, annotationListTopo, topoList);
    except :
        print >>sys.stderr, "except for the input file: %s" % inFile;
        raise ;
Example #21
0
def DrawPairwiseTopo(pairtopoAlnFile, aaSeqDict, pairCmpclassDict, outpath):
    """Draw shrinked and non-shrinked topology alignment images per pair.

    The records of ``pairtopoAlnFile`` are taken two at a time as aligned
    pairs.  For each pair with equally long aligned sequences the function
    writes ``<id1>-<id2>.topoaln.fa`` into ``outpath``, runs the dgscan
    program and ``drawMSATopo.py`` through the shell, renames the resulting
    PNG to ``.topoaln.shrinked.png`` / ``.topoaln.nonshrinked.png``, creates
    ``thumb.`` thumbnails with ``convert`` and finally removes the temporary
    amino acid sequence file and the dgscan output.

    ``dgscanprog`` and ``binpath`` are not parameters; they are read from
    the enclosing module scope.

    Args:
        pairtopoAlnFile: FASTA file with pairwise topology alignments, two
            consecutive records per pair.
        aaSeqDict: maps a sequence id to its amino acid sequence; ids that
            are missing are simply left out of the temporary sequence file.
        pairCmpclassDict: currently unused (see the note in the loop).
        outpath: directory receiving all output files.
    """
    (idList, annoList, seqList) = myfunc.ReadFasta(pairtopoAlnFile)
    numSeq = len(idList)
    # Explicit floor division: the result must stay an integer for range()
    # regardless of the division semantics in effect; an unpaired trailing
    # record is ignored.
    numPair = numSeq // 2

    print("numSeq =  %d" % numSeq)
    print("numPair =  %d" % numPair)

    for i in range(numPair):
        first = 2 * i
        second = first + 1
        id1 = idList[first]
        id2 = idList[second]
        if len(seqList[first]) != len(seqList[second]):
            print("Error for %s - %s " % (id1, id2))
            continue
        basename = "%s-%s" % (id1, id2)

        # NOTE: filtering of pairs by pairCmpclassDict and
        # g_params['cmpclassList'] is disabled; every pair is drawn.

        outPairAlnFile = outpath + os.sep + "%s.topoaln.fa" % (basename)
        # Context managers guarantee the files are flushed and closed
        # before the external programs read them, even on error.
        with open(outPairAlnFile, 'w') as fpout:
            fpout.write(">%s\n%s\n" % (annoList[first], seqList[first]))
            fpout.write(">%s\n%s\n" % (annoList[second], seqList[second]))

        outAASeqFile = outpath + os.sep + "%s.fa" % (basename)
        with open(outAASeqFile, "w") as fpout:
            for seqid in (id1, id2):
                if seqid in aaSeqDict:
                    fpout.write(">%s\n%s\n" % (seqid, aaSeqDict[seqid]))

        # Output dgscan file
        dgpfile = outpath + os.sep + basename + '.dgscan'
        cmd = "%s %s -lmin 21 -lmax 21 -o %s" % (dgscanprog, outAASeqFile,
                                                 dgpfile)
        os.system(cmd)

        # drawMSATopo.py is expected to produce outpngfile on each run; it
        # is renamed after every run so the second run does not overwrite
        # the first image.
        outpngfile = outpath + os.sep + "%s.topoaln.png" % basename
        outShrinkedFile = (outpath + os.sep +
                           "%s.topoaln.shrinked.png" % basename)
        thumb_outShrinkedFile = (outpath + os.sep + 'thumb.' +
                                 "%s.topoaln.shrinked.png" % basename)
        outNonShrinkedFile = (outpath + os.sep +
                              "%s.topoaln.nonshrinked.png" % basename)
        thumb_outNonShrinkedFile = (
            outpath + os.sep + 'thumb.' +
            "%s.topoaln.nonshrinked.png" % basename)

        # NOTE(review): the commands below are built by string
        # interpolation and run through the shell, so paths containing
        # spaces or shell metacharacters will not work.
        os.system(
            "python %s/drawMSATopo.py %s -pfm no -shrink yes -method mat" %
            (binpath, outPairAlnFile))
        os.system("mv %s %s" % (outpngfile, outShrinkedFile))
        os.system(
            "python %s/drawMSATopo.py %s -pfm no -shrink no -pdg yes -method yes -dgpfile %s"
            % (binpath, outPairAlnFile, dgpfile))
        os.system("mv %s %s" % (outpngfile, outNonShrinkedFile))
        os.system("convert -thumbnail 200 %s %s" %
                  (outShrinkedFile, thumb_outShrinkedFile))
        os.system("convert -thumbnail 200 %s %s" %
                  (outNonShrinkedFile, thumb_outNonShrinkedFile))
        os.system("rm -f %s %s" % (outAASeqFile, dgpfile))
Example #22
0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG,
                           isWriteRel):  #{{{
    """Dump the TOPCONS2 predictions of all sequences into text files.

    For every sequence in ``seqfile`` the result folder is located below
    ``path_result`` (either by the md5 digest of the sequence or by its
    index, depending on ``g_params['resultPathFormat']``).  A sequence
    counts as finished when ``Topcons/topcons.top`` exists in that folder.

    Output files:
        ``outfile``               - human readable report of all methods
        ``outfile``.fa            - consensus (TOPCONS) topology in FASTA
        ``outfile``.unfinished.fa - sequences without a finished prediction

    Args:
        seqfile: FASTA file with the query sequences.
        path_result: root folder of the prediction results.
        outfile: path of the report; also the prefix of the two FASTA
            outputs.
        isWriteDG: if true, append the lines of ``dg.txt`` that start with
            a digit.
        isWriteRel: if true, append ``Topcons/reliability.txt``.

    Returns:
        0 on success, 1 if one of the output files could not be opened.
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)
    outfile_unfinished_fa = "%s.unfinished.fa" % (outfile)
    numseq = len(seqidlist)

    # Open the outputs in a fixed order; on failure close whatever has
    # already been opened so no handle is leaked on the error path.
    opened = []
    for path in (outfile, outfile_fa, outfile_unfinished_fa):
        try:
            opened.append(open(path, "w"))
        except IOError:
            sys.stderr.write("Failed to write to file \"%s\"\n" % (path))
            for fp in opened:
                fp.close()
            return 1
    (fpout, fpout_fa, fpout_unfinished_fa) = opened

    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS',
        'Homology'
    ]
    # Methods whose topology file does not live at "<method>/query.top".
    special_topfile = {
        'TOPCONS': "Topcons/topcons.top",
        'Philius': "philius/query.top",
        'SCAMPI': "SCAMPI_MSA/query.top",
    }
    no_topo_msg = "***No topology could be produced with this method***"

    cntUnFinished = 0
    try:
        for iseq, seq in enumerate(seqlist):
            length = len(seq)
            desp = seqannolist[iseq]
            if g_params['resultPathFormat'] == "md5":
                # Two digests are tried, without and with a trailing
                # newline (presumably because results were stored under
                # either form); the first existing folder wins, otherwise
                # the last candidate is kept.
                for md5_key in [hashlib.md5(seq).hexdigest(),
                                hashlib.md5(seq + "\n").hexdigest()]:
                    datapath_this_seq = os.sep.join(
                        [path_result, md5_key[:2], md5_key[2:4], md5_key])
                    subdir = "%s/%s" % (datapath_this_seq, "seq_0")
                    if os.path.exists(subdir):
                        break
            else:
                subdir = "%s/%s" % (path_result, "seq_%d" % (iseq))

            if g_params['verbose']:
                print("subdir = %s" % (subdir))

            rstfile = "%s/Topcons/topcons.top" % (subdir)
            if not os.path.exists(rstfile):
                # write unfinished
                fpout_unfinished_fa.write(">%s\n%s\n" % (desp, seq))
                cntUnFinished += 1
                continue

            fpout.write("Sequence number: %d\n" % (iseq + 1))
            fpout.write("Sequence name: %s\n" % (desp))
            fpout.write("Sequence length: %d aa.\n" % (length))
            fpout.write("Sequence:\n%s\n\n\n" % (seq))

            topo_consensus = ""
            for method in methodlist:
                relpath = special_topfile.get(method,
                                              "%s/query.top" % (method))
                topfile = "%s/%s" % (subdir, relpath)
                seqid = ""
                top = ""
                if os.path.exists(topfile):
                    (seqid, _anno, top) = myfunc.ReadSingleFasta(topfile)
                if top == "":
                    top = no_topo_msg

                if method == "TOPCONS":
                    topo_consensus = top

                if method == "Homology":
                    # Label the homology section with the id read from the
                    # topology file when there is one.
                    showtext_homo = seqid if seqid != "" else method
                    fpout.write("%s:\n%s\n\n\n" % (showtext_homo, top))
                else:
                    fpout.write("%s predicted topology:\n%s\n\n\n" % (
                        method, top))

            if isWriteDG:
                dgfile = "%s/dg.txt" % (subdir)
                dg_content = ""
                if os.path.exists(dgfile):
                    dg_content = myfunc.ReadFile(dgfile)
                # Keep only the data rows, i.e. lines starting with a digit.
                dglines = [line for line in dg_content.split("\n")
                           if line and line[0].isdigit()]
                if dglines:
                    fpout.write(
                        "\nPredicted Delta-G-values (kcal/mol) "
                        "(left column=sequence position; right column=Delta-G)\n\n")
                    fpout.write("\n".join(dglines) + "\n")

            if isWriteRel:
                reliability_file = "%s/Topcons/reliability.txt" % (subdir)
                reliability = ""
                if os.path.exists(reliability_file):
                    reliability = myfunc.ReadFile(reliability_file)
                if reliability != "":
                    fpout.write(
                        "\nPredicted TOPCONS reliability (left "
                        "column=sequence position; right column=reliability)\n\n")
                    fpout.write(reliability + "\n")

            fpout.write("##############################################################################\n")

            # write the consensus prediction in FASTA format
            fpout_fa.write(">%s\n" % (desp))
            fpout_fa.write(topo_consensus + "\n")

        # Warn as soon as at least one prediction is missing (the previous
        # "> 1" test stayed silent when exactly one was unfinished).
        if cntUnFinished > 0:
            sys.stderr.write(
                "%s out of %d sequences are with unfinished predictions, please check.\n" % (
                    cntUnFinished, numseq))
    finally:
        # Close all outputs even if an exception interrupted the dump.
        for fp in opened:
            try:
                fp.close()
            except IOError:
                sys.stderr.write("Failed to close file \"%s\"\n" % (fp.name))

    return 0
def main(g_params):  #{{{
    """Compute the KR bias for every sequence that has a topology.

    Command line options (read from ``sys.argv``):
        -seqfile FILE    FASTA file with amino acid sequences
        -topofile FILE   FASTA file with topologies; the second token of
                         each annotation line is taken as the comparison
                         class of the sequence id given by the first token
        -outpath DIR     output folder (default: the folder of seqfile)
        -maxdist INT     maximal distance to the TM helix (default 12)
        -flankwin INT    flanking window of the TM helix (default 5)
        -l FILE, ID...   accepted but currently not used
        -q / -debug      set g_params['isQuiet'] / g_params['isDEBUG']
        -h, --help       print the help text

    Two files are written to the output folder, named after the root name
    of seqfile: ``.krlist.txt`` (filled by WriteResult for every processed
    sequence) and ``.krbias.txt`` (one kr_bias value per line for sequences
    of class "IDT" or "INV").

    Args:
        g_params: dict of global run-time parameters; 'isQuiet' and
            'isDEBUG' may be set here, 'isDEBUG' is read in the main loop.

    Returns:
        0 on success, 1 on usage errors or when no sequence/topology could
        be read.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    # None means "not given on the command line"; it is resolved to the
    # folder of seqfile after parsing.
    outpath = None
    idListFile = None
    idList = []
    seqfile = ""
    topofile = ""
    max_dist = 12  # maximal distance to the TM helix so that K, R residues are counted
    flank_win = 5  # flanking window of the TM helix, residues at position
    #TMbeg-flank_win and TMend+flank_win are also counted

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxdist", "--maxdist"]:
                (max_dist, i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-flankwin", "--flankwin"]:
                (flank_win, i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topofile", "--topofile"]:
                (topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (idListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                print("Error! Wrong argument:", argv[i], file=sys.stderr)
                return 1
        else:
            idList.append(argv[i])
            i += 1

    (idListSeq, annoListSeq, seqList) = myfunc.ReadFasta(seqfile)
    (idListTopo, annoListTopo, topoList) = myfunc.ReadFasta(topofile)

    numSeq = len(idListSeq)
    numTopo = len(idListTopo)
    if numSeq < 1 or numTopo < 1:
        print("No seq set", file=sys.stderr)
        return 1
    seqDict = dict(zip(idListSeq, seqList))
    topoDict = dict(zip(idListTopo, topoList))

    cmpclassDict = {}
    for anno in annoListTopo:
        strs = anno.lstrip(">").split()
        # Annotations without a class token are skipped; such sequences get
        # the default class "INV" below instead of raising IndexError here.
        if len(strs) >= 2:
            cmpclassDict[strs[0]] = strs[1]

    # Honour -outpath when it was given (it used to be parsed and then
    # overwritten unconditionally); otherwise write next to seqfile.
    if outpath is None:
        outpath = os.path.dirname(seqfile)
    if outpath == "":
        outpath = "."
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    rootname = os.path.basename(os.path.splitext(seqfile)[0])
    outfile_kr_list = outpath + os.sep + rootname + ".krlist.txt"
    outfile_krbias = outpath + os.sep + rootname + ".krbias.txt"

    # The context managers close both outputs even if CalKRBias or
    # WriteResult raises.
    with open(outfile_kr_list, "w") as fpout_krlist, \
            open(outfile_krbias, "w") as fpout_krbias:
        for idd in idListSeq:
            if g_params['isDEBUG']:
                print("seqid: %s" % (idd))
            if idd not in topoDict:
                print("no topo for %s" % idd, file=sys.stderr)
                continue
            topo = topoDict[idd]
            seq = seqDict[idd]
            cmpclass = cmpclassDict.get(idd, "INV")
            (kr_bias, KR_pos_list, numTM) = CalKRBias(seq, topo, flank_win,
                                                      max_dist)
            WriteResult(idd, cmpclass, seq, numTM, kr_bias, KR_pos_list,
                        fpout_krlist)
            if cmpclass in ["IDT", "INV"]:
                fpout_krbias.write("%d\n" % kr_bias)
    return 0
Example #24
0
def main():#{{{
    """Parse the command line and convert a topology file to TM fragments.

    Options are read from ``sys.argv``:
        -i, --infile FILE    topology file (may also be given as a bare
                             argument, or after "--")
        -f, --fasta FILE     amino acid sequences in FASTA format
        -o, --outfile FILE   output file; stdout is used when not set or
                             when the file cannot be opened for writing
        -printid VALUE       sets the module-level ``isPrintSeqID`` flag to
                             True when VALUE starts with "y" (any case)
        -h, --help           print the help text

    The topology file is read in blocks of BLOCK_SIZE and every batch of
    complete records is handed to ``Topo2TMFrag`` together with a dict that
    maps sequence ID to amino acid sequence.

    Returns None on success, 1 on a usage/input error and -1 when the
    topology file cannot be opened.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    global isPrintSeqID
    outFile = ""
    inFile = ""
    fastaFile = ""

    # Options that take the next command line argument as their value.
    valueOptions = ("-i", "--infile", "-f", "--fasta", "-printid",
                    "--printid", "-o", "--outfile")

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            # The argument after "--" is the topology file even when it
            # starts with "-"; it used to be silently dropped.
            inFile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            if arg not in valueOptions:
                sys.stderr.write("Error! Wrong argument:%s\n" % arg)
                return 1
            if i + 1 >= numArgv:
                # Without this check a trailing option raised IndexError.
                sys.stderr.write("Error! Option %s requires a value.\n" % arg)
                return 1
            value = sys.argv[i + 1]
            if arg in ("-i", "--infile"):
                inFile = value
            elif arg in ("-f", "--fasta"):
                fastaFile = value
            elif arg in ("-printid", "--printid"):
                isPrintSeqID = value.lower().startswith("y")
            else:  # -o / --outfile
                outFile = value
            i += 2
        else:
            inFile = arg
            i += 1

    if inFile == "":
        sys.stderr.write("Error! Topology file not set.\n")
        return 1
    if fastaFile == "":
        sys.stderr.write("Error!  amino acid fasta file not set.\n")
        return 1

    fpout = sys.stdout
    if outFile != "":
        # open() raises instead of returning a false value, so the fallback
        # to stdout has to live in an exception handler.
        try:
            fpout = open(outFile, "w")
        except IOError:
            sys.stderr.write("Failed to write to outfile %s. \n" % (outFile))
            sys.stderr.write("Reset output to stdout.\n")
            fpout = sys.stdout

    try:
        sizeAASeqFile = os.path.getsize(fastaFile)
        if sizeAASeqFile > MAX_FASTA_AA_FILE_SIZE:
            sys.stderr.write("size (%d)" % sizeAASeqFile
                             + " of fasta sequence file (%s)" % fastaFile
                             + " is over the limit (%d). Exit.\n"
                             % MAX_FASTA_AA_FILE_SIZE)
            return 1

        (idListSeq, _annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile)
        if idListSeq is None:
            sys.stderr.write("%s exit with error.\n" % sys.argv[0])
            return 1
        if len(idListSeq) < 1:
            # The original compared the list itself with 1, which never
            # triggered this warning.
            sys.stderr.write("Warning! zero aa sequences have"
                             + " been read in for file %s\n" % fastaFile)
        aaSeqDict = dict(zip(idListSeq, seqList))

        try:
            fpin = open(inFile, "rb")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" % (inFile))
            return -1
        try:
            unprocessedBuffer = ""
            isEOFreached = False
            processedTopoIDSet = set([])
            while True:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means the end of the file has been reached.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                # Prepend the incomplete record left over from the last block.
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    idListTopo = [r[0] for r in recordList]
                    topoList = [r[2] for r in recordList]
                    Topo2TMFrag(idListTopo, topoList, aaSeqDict,
                                processedTopoIDSet, fpout)
                if isEOFreached:
                    break
        finally:
            fpin.close()
    finally:
        # Close the output on every exit path, but never close stdout.
        if fpout is not None and fpout is not sys.stdout:
            fpout.close()
Example #25
0
def MakeTMplot(seqAlnFile, topAlnFile, outpath, tmpdir):# {{{
    """Make topology plot for TM family.

    Both alignment files are copied into ``tmpdir`` and the external plotting
    pipeline is run there: drawMSATopo.py, mogrify, write_seqaln_colorTM.py,
    wkhtmltopdf, pdfcrop and merge_tmplot.sh. When the cropped merged PDF
    exists it is copied to ``<outpath>/<rootname>.seqtopaln.pdf``, where
    ``rootname`` is the basename of ``seqAlnFile`` without its extension.

    The working directory is changed to ``tmpdir`` while the tools run and is
    always restored before the function exits, including when a step fails
    or raises.

    Returns 0 when every step ran, 1 as soon as one step fails or the
    topology figure is missing.
    """
    rootname = os.path.basename(os.path.splitext(seqAlnFile)[0])
    basename_seqAlnFile = os.path.basename(seqAlnFile)
    basename_topAlnFile = os.path.basename(topAlnFile)
    ext_topAlnFile = os.path.splitext(topAlnFile)[1].lstrip('.')

    shutil.copy2(seqAlnFile, os.path.join(tmpdir, basename_seqAlnFile))
    shutil.copy2(topAlnFile, os.path.join(tmpdir, basename_topAlnFile))

    def run_step(cmd, verbose_msg=None):
        """Run one external command; report its message and return False on failure."""
        if verbose_msg is not None and g_params['verbose']:
            print(verbose_msg)
        (isCmdSuccess, t_runtime, t_msg) = myfunc.RunCmd(cmd)
        if not isCmdSuccess:
            print(t_msg)
        return isCmdSuccess

    cwd = os.getcwd()
    os.chdir(tmpdir)
    try:
        # generate topology one line plot
        cmd = [python_exec, os.path.join(rundir, "drawMSATopo.py"), "-m-shrink",
            str(0), "-method", "pil",  "-pfm", "no", "-text", "n",  "-pdg", "n",
            "-pfm", "n",  "-pmsa", "y", "-ptag", "y", "-showTMidx", "-sep", "n",
            "--advtopo",   "-cleanplot", "-h2wratio", str(g_params["H2W_ratio"]),
            "-shrink", "no", "-showgap", basename_topAlnFile]
        if not run_step(cmd, "Generating toplogy alignment figure for %s"%(rootname)):
            return 1
        topalnfigure = "%s.png"%(rootname)
        if not os.path.exists(topalnfigure):
            return 1

        # resize a copy of the figure file (mogrify edits the file in place)
        resized_topalnfigure = "%s.s%d.png"%(rootname, g_params['figure_resize'])
        shutil.copy2(topalnfigure, resized_topalnfigure)
        cmd = ["mogrify", "-resize", str(g_params['figure_resize']), resized_topalnfigure]
        if not run_step(cmd, "Resizing the topology alignment figure for %s"%(rootname)):
            return 1

        # generate seqaln figure
        seqaln_htmlfigure = "%s.%s"%(rootname, "seqaln.html")
        cmd = [python_exec, os.path.join(rundir, "write_seqaln_colorTM.py"),
                basename_seqAlnFile, "-ext-topomsa", ext_topAlnFile, "-ws",
                str(g_params['window_size']), "-o",
                seqaln_htmlfigure, "-cleanplot", "-rmgap"]
        if g_params['isBreakTM']:
            cmd += ["-breakTM"]
        if not run_step(cmd, "Generating sequence alignment highlighted by TM regions for %s"%(rootname)):
            return 1

        # convert html to pdf
        seqaln_pdffigure = "%s.%s"%(rootname, "seqaln.pdf")
        cmd = ["wkhtmltopdf",  seqaln_htmlfigure, seqaln_pdffigure]
        if os_dist.lower() in ["debian", "ubuntu"]:
            cmd = ["xvfb-run"] + cmd
        if not run_step(cmd, "Convert the html figure to PDF for sequence alignment"):
            return 1

        # crop the PDF figure; the cropped file name below is the one the
        # merge step expects pdfcrop to have produced
        if not run_step(["pdfcrop", seqaln_pdffigure]):
            return 1
        seqaln_pdffigure_crop =  "%s.%s"%(rootname, "seqaln-crop.pdf")

        # merge figures
        (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(basename_seqAlnFile)
        # NOTE(review): str_evalue is computed but not used in this function.
        str_evalue = ""
        if len(seqAnnoList) > 0:
            str_evalue = seqAnnoList[0].split('/')[-1]

        outfile = "%s.seqtopaln.pdf"%(rootname)
        cmd = ["bash", os.path.join(rundir, "merge_tmplot.sh"),
                resized_topalnfigure, seqaln_pdffigure_crop, "-cap",
                "%s"%(rootname), "-o", outfile]
        # one caption per sequence, labelled with successive alphabet entries
        for idx, seqID in enumerate(seqIDList):
            cmd += ["-cap", "%s: %s"%(alphabet[idx], seqID)]
        if not run_step(cmd, "Merging the topology alignment figure and sequence alignment figure for %s"%(rootname)):
            return 1

        # copy the pdf figure generated by latex to a tmp file (a hack for
        # pdfcrop)
        tmpoutfile = "tt1.pdf"
        shutil.copy2(outfile, tmpoutfile)

        # crop the merged PDF figure
        if not run_step(["pdfcrop", tmpoutfile]):
            return 1
        outfile_crop =  "tt1-crop.pdf"

        # NOTE(review): a relative outpath is resolved against tmpdir here,
        # because the working directory has been changed.
        if os.path.exists(outfile_crop):
            final_targetfile =  os.path.join(outpath, "%s.seqtopaln.pdf"%(rootname))
            shutil.copy2(outfile_crop, final_targetfile)

        if g_params['verbose']:
            print(("Copy the result to final target %s"%(os.path.join(outpath, outfile))))

        return 0
    finally:
        # Restore the caller's working directory on every exit path; the
        # early returns above used to leave the process inside tmpdir.
        os.chdir(cwd)
Example #26
0
        sys.exit(1)
    if topoWithDGScoreFile == "" and dgscanFile == "":
        print >> sys.stderr, "Error! Either topoWithDGScoreFile or dgscanFile should be set."
        sys.exit(1)
    if topoWithDGScoreFile != "" and dgscanFile != "":
        print >> sys.stderr, "Error! Only one of the topoWithDGScoreFile and dgscanFile can be set."
        sys.exit(1)

    fpout = sys.stdout
    if outFile != "":
        fpout = open(outFile, "w")

    try:
        gapopenList = []
        topoWithDGScoreList = []
        (idListSeq, annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile)
        if topoWithDGScoreFile != "":
            (topoWithDGScoreList,
             indexID) = ReadTopoWithDGScore(topoWithDGScoreFile)
            gapopenList = GetGapOpenValues(topoWithDGScoreList)
            if not (len(gapopenList) == len(idListSeq)
                    and len(gapopenList) == len(topoWithDGScoreList)):
                print >> sys.stderr, "length mismatch"
                print >> sys.stderr, "len(gapopenList)=", len(gapopenList)
                print >> sys.stderr, "len(idListSeq)=", len(idListSeq)
                print >> sys.stderr, "len(topoWithDGScoreList)=", len(
                    topoWithDGScoreList)
                sys.exit(1)
        elif dgscanFile != "":
            (dgscanList, indexID) = ReadDGScan(dgscanFile)
            gapopenList = GetGapOpenValuesFromDGScan(dgscanList)
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run the prediction workflow for every sequence of a FASTA file.

    For each record in ``infile`` the function first looks for a cached
    result under ``path_md5cache`` (keyed by the MD5 of the sequence, skipped
    when ``g_params['isForceRun']`` is set) and symlinks it into the result
    folder. The remaining sequences are written to a FASTA file, passed to
    ``script_scampi`` to estimate their number of TM helices, and then sent
    one by one to ``runscript`` in descending order of that estimate. Each
    finished sequence is recorded as a tab separated line in
    ``finished_seqs.txt``.

    Parameters:
        infile    FASTA file with the query sequences.
        outpath   job folder; results go to ``<outpath>/<jobid>``.
        tmpdir    scratch folder; the workflow writes to ``<tmpdir>/<jobid>``.
        email     not used in the part of the function shown here.
        jobid     name of the result sub-folder.
        g_params  dict; the keys read here are 'isForceRun', 'runjob_err',
                  'runjob_log' and 'base_www_url'.

    The code shown here has no return statement.
    """
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0))
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)

    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    # NOTE(review): a success here sets isOK back to True even when creating
    # the temporary folder above failed.
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    if isOK:
        # create (or truncate) the list of finished sequences
        try:
            open(finished_seq_file, 'w').close()
        except:
            pass
#first getting result from caches
# ==================================

        maplist = []
        maplist_simple = []
        # origIndex -> [seq, numTM, description] for sequences without cache
        toRunDict = {}
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)

            recordList = hdl.readseq()
            cnt = 0
            # NOTE(review): origpath is saved but the working directory is
            # not changed back to it anywhere in the code shown here.
            origpath = os.getcwd()
            while recordList != None:
                for rd in recordList:
                    isSkip = False
                    # temp outpath for the sequence is always seq_0, and I feed
                    # only one seq a time to the workflow
                    tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                      "seq_%d" % 0)
                    outpath_this_seq = "%s/%s" % (outpath_result,
                                                  "seq_%d" % cnt)
                    subfoldername_this_seq = "seq_%d" % (cnt)
                    if os.path.exists(tmp_outpath_this_seq):
                        try:
                            shutil.rmtree(tmp_outpath_this_seq)
                        except OSError:
                            pass

                    maplist.append(
                        "%s\t%d\t%s\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append(
                        "%s\t%d\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description))
                    if not g_params['isForceRun']:
                        # cache entries live at <path_md5cache>/<first two
                        # hex digits>/<md5 of the sequence>
                        md5_key = hashlib.md5(rd.seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        if os.path.exists(md5_link):
                            # create a symlink to the cache
                            rela_path = os.path.relpath(
                                md5_link, outpath_result)  #relative path
                            os.chdir(outpath_result)
                            os.symlink(rela_path, subfoldername_this_seq)

                            # the symlink resolves, so reuse the cached result
                            if os.path.exists(outpath_this_seq):
                                runtime = 0.0  #in seconds
                                topfile = "%s/%s/topcons.top" % (
                                    outpath_this_seq, "Topcons")
                                top = myfunc.ReadFile(topfile).strip()
                                numTM = myfunc.CountTM(top)
                                posSP = myfunc.GetSPPosition(top)
                                if len(posSP) > 0:
                                    isHasSP = True
                                else:
                                    isHasSP = False
                                info_finish = [
                                    "seq_%d" % cnt,
                                    str(len(rd.seq)),
                                    str(numTM),
                                    str(isHasSP), "cached",
                                    str(runtime), rd.description
                                ]
                                myfunc.WriteFile("\t".join(info_finish) + "\n",
                                                 finished_seq_file,
                                                 "a",
                                                 isFlush=True)
                                isSkip = True

                    if not isSkip:
                        # first try to delete the outfolder if exists
                        if os.path.exists(outpath_this_seq):
                            try:
                                shutil.rmtree(outpath_this_seq)
                            except OSError:
                                pass
                        origIndex = cnt
                        numTM = 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description
                                                ]  #init value for numTM is 0

                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        # written even when reading infile failed (maplist_simple is then
        # empty)
        myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

        # run scampi single to estimate the number of TM helices and then run
        # the query sequences in the descending order of numTM
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            # element 0 of the entry is the amino acid sequence; the record
            # ID written to the file is the original index
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        topfile_scampiseq = "%s/%s" % (tmp_outpath_result,
                                       "query.torun.fa.topo")
        if os.path.exists(torun_all_seqfile):
            # run scampi to estimate the number of TM helices
            cmd = [
                script_scampi, torun_all_seqfile, "-outpath",
                tmp_outpath_result
            ]
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                pass
        if os.path.exists(topfile_scampiseq):
            (idlist_scampi, annolist_scampi,
             toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq)
            for jj in xrange(len(idlist_scampi)):
                numTM = myfunc.CountTM(toplist_scampi[jj])
                # IDs that are not integers or not in toRunDict are ignored
                try:
                    toRunDict[int(idlist_scampi[jj])][1] = numTM
                except (KeyError, ValueError, TypeError):
                    pass

        # sort by the estimated number of TM helices, largest first
        sortedlist = sorted(toRunDict.items(),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]

        # submit sequences one by one to the workflow according to orders in
        # sortedlist

        for item in sortedlist:
            #             g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" %
                                              (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" %
                                          (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append(
                    "failed to generate seq index %d" % (origIndex))
                continue

            cmd = [
                runscript, seqfile_this_seq, tmp_outpath_result, blastdir,
                blastdb
            ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
                g_params['runjob_log'].append("workflow:\n" + rmsg + "\n")
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                # NOTE(review): check_output raised before assigning, so rmsg
                # still holds the value from an earlier call (or "").
                g_params['runjob_err'].append(rmsg + "\n")
                pass
                #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
                #if len(suqoutfilelist)>0:
                #    suqoutfile = suqoutfilelist[0]
                #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time

            # the workflow output exists: move it from the scratch folder to
            # the result folder of this sequence
            if os.path.exists(tmp_outpath_this_seq):
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                isCmdSuccess = False
                try:
                    subprocess.check_output(cmd)
                    isCmdSuccess = True
                except subprocess.CalledProcessError, e:
                    msg = "Failed to run prediction for sequence No. %d\n" % (
                        origIndex)
                    g_params['runjob_err'].append(msg)
                    g_params['runjob_err'].append(str(e) + "\n")
                    pass
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists(
                        outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        g_params['runjob_err'].append(
                            "Failed to move %s/time.txt" %
                            (tmp_outpath_result) + "\n")
                        pass

                if isCmdSuccess:
                    runtime = runtime_in_sec  #in seconds
                    topfile = "%s/%s/topcons.top" % (outpath_this_seq,
                                                     "Topcons")
                    top = myfunc.ReadFile(topfile).strip()
                    numTM = myfunc.CountTM(top)
                    posSP = myfunc.GetSPPosition(top)
                    if len(posSP) > 0:
                        isHasSP = True
                    else:
                        isHasSP = False
                    info_finish = [
                        "seq_%d" % origIndex,
                        str(len(seq)),
                        str(numTM),
                        str(isHasSP), "newrun",
                        str(runtime), description
                    ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)
                    # now write the text output for this seq

                    info_this_seq = "%s\t%d\t%s\t%s" % (
                        "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq,
                                                      outpath_result,
                                                      [info_this_seq],
                                                      runtime_in_sec,
                                                      g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if g_params['base_www_url'].find("topcons.net") != -1:
                        md5_key = hashlib.md5(seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        # replace an existing cache entry for this sequence
                        if os.path.exists(md5_link):
                            try:
                                os.unlink(md5_link)
                            except:
                                pass
                        subfolder_md5 = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        if not os.path.exists(subfolder_md5):
                            try:
                                os.makedirs(subfolder_md5)
                            except:
                                pass

                        rela_path = os.path.relpath(
                            outpath_this_seq, md5_subfolder)  #relative path
                        # NOTE(review): this changes the working directory of
                        # the process and failures are silently ignored.
                        try:
                            os.chdir(md5_subfolder)
                            os.symlink(rela_path, md5_key)
                        except:
                            pass
Example #28
0
def main(g_params):
    """Write the records of an MSA file in the order given by an order list.

    Options are read from ``sys.argv``:
        MSAFILE, -msafile FILE    alignment in FASTA format (a bare argument
                                  or the argument after "--" is also taken
                                  as the MSA file)
        -orderlist FILE           file read with ``ReadOrderList``; gives the
                                  sequence IDs in the wanted order
        -o FILE                   output file name, passed to
                                  ``myfunc.myopen`` with sys.stdout as default
        -of, -outformat FORMAT    "fasta" (default) writes annotation and
                                  sequence, "anno" writes annotation lines only
        -h, --help                print the help text

    IDs from the order list that are missing in the MSA file are reported on
    stderr and skipped. ``g_params`` is not used by this function.

    Returns 0 on success and 1 on a usage error.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outFile = ""
    orderlistfile = ""
    msafile = ""
    outformat = "fasta" # fasta or anno

    # Options that take the next command line argument as their value.
    valueOptions = ("-o", "--o", "-orderlist", "--orderlist", "-msafile",
                    "--msafile", "-of", "--of", "-outformat", "--outformat")

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            msafile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            if arg not in valueOptions:
                print(("Error! Wrong argument:%s" % arg), file=sys.stderr)
                return 1
            if i + 1 >= numArgv:
                # Without this check a trailing option raised IndexError.
                print("Error! Option %s requires a value." % arg,
                      file=sys.stderr)
                return 1
            value = sys.argv[i + 1]
            if arg in ("-o", "--o"):
                outFile = value
            elif arg in ("-orderlist", "--orderlist"):
                orderlistfile = value
            elif arg in ("-msafile", "--msafile"):
                msafile = value
            else:  # -of / -outformat
                outformat = value.lower()
            i += 2
        else:
            msafile = arg
            i += 1

    if outformat not in ["anno", "fasta"]:
        print("Unrecognized outformat \"%s\","%(
                outformat) + " should be either \"anno\" or \"fasta\".", file=sys.stderr)
        return 1

    if orderlistfile == "":
        print("orderlist file not set. Exit", file=sys.stderr)
        return 1
    if msafile == "":
        print("msafile not set. Exit", file=sys.stderr)
        # The message says "Exit" but the original carried on and tried to
        # read an empty file name.
        return 1

    orderList = ReadOrderList(orderlistfile)
    (idList, annoList, seqList) = myfunc.ReadFasta(msafile)

    if len(orderList) > 0 and len(idList) > 0:
        annoDict = dict(zip(idList, annoList))
        # Sequences are only needed when they are going to be written.
        seqDict = {}
        if outformat != "anno":
            seqDict = dict(zip(idList, seqList))

        fpout = myfunc.myopen(outFile, sys.stdout, "w", False)
        try:
            for sid in orderList:
                if sid in annoDict:
                    fpout.write(">%s\n"%annoDict[sid])
                    if outformat != "anno":
                        fpout.write("%s\n"%seqDict[sid])
                else:
                    print("seqid %s not in msafile %s"%(
                            sid, msafile), file=sys.stderr)
        finally:
            myfunc.myclose(fpout)

    return 0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG,
                           isWriteRel):  #{{{
    """Dump the predictions of all methods to a text file and a FASTA file.

    For the i-th sequence of ``seqfile`` the per-method topology files are
    looked up below ``<path_result>/seq_<i>``. A report with the topology of
    every method is written to ``outfile`` and the TOPCONS topology of each
    sequence is written in FASTA format to ``<outfile>.fa``.

    Parameters:
        seqfile      FASTA file with the query sequences.
        path_result  folder holding one ``seq_<i>`` sub-folder per sequence.
        outfile      path of the text report.
        isWriteDG    also write the lines of ``dg.txt`` that start with a
                     digit.
        isWriteRel   also write the content of ``Topcons/reliability.txt``.

    Returns 0 on success and 1 when one of the output files cannot be opened.
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)

    try:
        fpout = open(outfile, "w")
    except IOError:
        sys.stderr.write("Failed to write to file \"%s\"\n" % (outfile))
        return 1

    try:
        fpout_fa = open(outfile_fa, "w")
    except IOError:
        # Do not leak the report file that has already been opened.
        fpout.close()
        sys.stderr.write("Failed to write to file \"%s\"\n" % (outfile_fa))
        return 1

    def writeline(fp, text):
        """Write text followed by a newline (same output as ``print >> fp``)."""
        fp.write(text + "\n")

    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS',
        'Homology'
    ]
    # Topology file of a method relative to the sequence folder; methods not
    # listed here use "<method>/query.top".
    special_topfile = {
        'TOPCONS': "Topcons/topcons.top",
        'Philius': "philius/query.top",
        'SCAMPI': "SCAMPI_MSA/query.top",
    }

    try:
        for idx in range(len(seqidlist)):
            subdir = "%s/%s" % (path_result, "seq_%d" % (idx))
            seq = seqlist[idx]
            desp = seqannolist[idx]
            writeline(fpout, "Sequence number: %d" % (idx + 1))
            writeline(fpout, "Sequence name: %s" % (desp))
            writeline(fpout, "Sequence length: %d aa." % (len(seq)))
            writeline(fpout, "Sequence:\n%s\n\n" % (seq))

            topo_consensus = ""
            # A separate loop variable is used so that the sequence index is
            # not overwritten while iterating over the methods.
            for method in methodlist:
                topfile = "%s/%s" % (
                    subdir,
                    special_topfile.get(method, "%s/query.top" % (method)))
                seqid = ""
                top = ""
                if os.path.exists(topfile):
                    (seqid, seqanno, top) = myfunc.ReadSingleFasta(topfile)
                if top == "":
                    top = "***No topology could be produced with this method***"

                if method == "TOPCONS":
                    topo_consensus = top

                if method == "Homology":
                    # Label the homology result with the ID read from the
                    # topology file when there is one.
                    showtext_homo = method
                    if seqid != "":
                        showtext_homo = seqid
                    writeline(fpout, "%s:\n%s\n\n" % (showtext_homo, top))
                else:
                    writeline(fpout,
                              "%s predicted topology:\n%s\n\n" % (method, top))

            if isWriteDG:
                dgfile = "%s/dg.txt" % (subdir)
                dg_content = ""
                if os.path.exists(dgfile):
                    dg_content = myfunc.ReadFile(dgfile)
                # keep only the data lines, i.e. those starting with a digit
                dglines = [line for line in dg_content.split("\n")
                           if line and line[0].isdigit()]
                if len(dglines) > 0:
                    writeline(fpout, "\nPredicted Delta-G-values (kcal/mol) "
                              "(left column=sequence position; right column=Delta-G)\n")
                    writeline(fpout, "\n".join(dglines))

            if isWriteRel:
                reliability_file = "%s/Topcons/reliability.txt" % (subdir)
                reliability = ""
                if os.path.exists(reliability_file):
                    reliability = myfunc.ReadFile(reliability_file)
                if reliability != "":
                    writeline(fpout, "\nPredicted TOPCONS reliability (left "
                              "column=sequence position; right column=reliability)\n")
                    writeline(fpout, reliability)

            writeline(fpout, "##############################################################################")

            # write the consensus prediction in FASTA format
            writeline(fpout_fa, ">%s" % (desp))
            writeline(fpout_fa, topo_consensus)
    finally:
        # Close both files on every exit path; errors on close are ignored
        # as before.
        for fp in (fpout, fpout_fa):
            try:
                fp.close()
            except IOError:
                pass

    return 0