Ejemplo n.º 1
0
def DrawSeqMSA(seqmsafile, outpath):
    print "Remove gaps from sequence"
    (idList, annotationList, seqList) = myfunc.ReadFasta(seqmsafile)
    rootname = os.path.basename(os.path.splitext(seqmsafile)[0])
    basename = os.path.basename(seqmsafile)
    seqfile = outpath + os.sep + rootname + '.fa'
    fpout = open(seqfile, "w")
    for i in xrange(len(idList)):
        fpout.write(">%s\n" % annotationList[i])
        fpout.write("%s\n" % seqList[i].replace("-", "").replace(".", ""))
    fpout.close()

    print "Predicting topologies..."
    scampi_exe = "%s/mySCAMPI_run.pl" % g_params['newscampiscriptpath']
    scampi_dir = g_params['scampi_dir']
    modhmm_bin = g_params['modhmm_bin']
    cmd = "%s %s --scampipath %s --modhmmpath %s --outpath %s" % (
        scampi_exe, seqfile, scampi_dir, modhmm_bin, outpath)
    os.system(cmd)
    os.system("rm -f %s/*.res" % outpath)

    print "Get topomsa"
    binpath = g_params['binpath']
    topofile = outpath + os.sep + rootname + '.fa.topo'
    topomsafile = outpath + os.sep + rootname + '.topomsa.fa'
    cmd = "%s/matchMSAtopo -msa %s -topo %s -o %s" % (binpath, seqmsafile,
                                                      topofile, topomsafile)
    os.system(cmd)

    print "Draw topomsa"
    cmd = "python %s/drawMSATopo.py %s -text y -outpath %s -aaseq %s" % (
        binpath, topomsafile, outpath, seqfile)
    os.system(cmd)
Ejemplo n.º 2
0
def main(g_params):  #{{{
    """Print the TM positions of every topology found in a FASTA file.

    Arguments are read from sys.argv:
        topofile, -i FILE   topology file, read with myfunc.ReadFasta
        -o FILE             output file, opened via
                            myfunc.myopen(outfile, sys.stdout, "w", False)
        -gapless            drop '-' and '.' from each topology first
        -q                  set g_params['isQuiet'] to True
        -h, --help          print the help text
        --                  take the next argument as the topology file

    For each record one line "<seqid> <posTMList>" is written, where
    posTMList is the value returned by myfunc.GetTMPosition.

    Returns:
        1 on a usage error or when reading/writing fails; None (implicit)
        after a successful run.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    topofile = ""
    outfile = ""
    isGapLess = False

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            topofile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() also copes with an empty-string argument, which
            # argv[i][0] would turn into an IndexError.
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-i", "--i"]:
                # these options consume the following argument; report a
                # usage error instead of crashing when it is missing
                if i + 1 >= numArgv:
                    print >> sys.stderr, "Error! Option %s requires a value" % (
                        argv[i])
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    topofile = argv[i + 1]
                i += 2
            elif argv[i] in ["-gapless", "--gapless"]:
                isGapLess = True
                i += 1
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            topofile = argv[i]
            i += 1
    if topofile == "":
        print >> sys.stderr, "topofile not set. exit"
        return 1
    try:
        (idList, annoList, seqList) = myfunc.ReadFasta(topofile)
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        for i in xrange(len(idList)):
            topo = seqList[i]
            seqid = idList[i]
            if isGapLess:
                topo = topo.replace("-", "").replace(".", "")
            posTMList = myfunc.GetTMPosition(topo)
            print >> fpout, seqid, posTMList
        myfunc.myclose(fpout)
    except (IOError, IndexError) as err:
        # Previously swallowed silently, which made failed runs look
        # successful; report the problem and signal failure to the caller.
        print >> sys.stderr, "Failed to process %s: %s" % (topofile, err)
        return 1
Ejemplo n.º 3
0
def WriteSeqAlnHTML(seqAlnFileList, extTopoMSA, outfile):  # {{{
    """Write one HTML page containing every alignment in seqAlnFileList.

    For each alignment file the companion topology MSA is expected at
    "<alnfile without extension>.<extTopoMSA>". Pairs where either file is
    missing are reported on stderr and skipped. When
    g_params['removeUnnecessaryGap'] is set, both the sequence and the
    topology alignments are passed through lcmp.RemoveUnnecessaryGap.

    Args:
        seqAlnFileList: iterable of sequence-alignment file paths (FASTA).
        extTopoMSA: extension (without leading dot) of the topology MSA.
        outfile: path of the HTML file to create.

    Returns:
        1 if outfile cannot be opened for writing, otherwise 0.
    """
    try:
        fpout = open(outfile, "w")
    except IOError:
        print("Failed to write to %s" % (outfile), file=sys.stderr)
        return 1
    # The with-block closes the page even if one of the writers raises.
    with fpout:
        WriteHTMLHeader(
            'Alignment highlighted by <font color=%s>TM regions</font>' %
            ('red'), fpout)
        print("Processed alignments:")
        for alnfile in seqAlnFileList:
            alnroot = os.path.splitext(alnfile)[0]
            rootname_alnfile = os.path.basename(alnroot)
            topomsafile = '.'.join([alnroot, extTopoMSA])

            # check each file once and report every missing one
            has_alnfile = os.path.exists(alnfile)
            has_topomsafile = os.path.exists(topomsafile)
            if not has_alnfile:
                sys.stderr.write('alnfile %s does not exist\n' % (alnfile))
            if not has_topomsafile:
                sys.stderr.write('topomsafile %s does not exist\n' %
                                 (topomsafile))
            if not (has_alnfile and has_topomsafile):
                continue

            (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
            (topoIDList, topoAnnoList,
             topoList) = myfunc.ReadFasta(topomsafile)
            if g_params['removeUnnecessaryGap']:
                seqList = lcmp.RemoveUnnecessaryGap(seqList)
                topoList = lcmp.RemoveUnnecessaryGap(topoList)

            # since there is no shrinking, index map is always p->p
            final2seq_idxMapList = [
                {j: j for j in range(len(seqList[i]))}
                for i in range(len(seqIDList))
            ]

            print('\t' + rootname_alnfile)
            # topoList is passed twice on purpose, exactly as before
            WriteHTMLAlignment2(rootname_alnfile, topoIDList, topoAnnoList,
                                topoList, topoList, seqList,
                                final2seq_idxMapList, fpout)

        WriteHTMLTail(fpout)
    return 0
Ejemplo n.º 4
0
def MatchTopoPairAln(queryTopoFile,alignFile, targetsTopologyFile, fpout):#{{{
    """Project the query and target topologies onto pairwise alignments.

    queryTopoFile       -- FASTA file holding the single query topology
    alignFile           -- alignment file parsed by ReadNeedleAlignment;
                           record i must provide 'alnseq1', 'alnseq2' and
                           'seqid2'
    targetsTopologyFile -- FASTA file with one topology per target
    fpout               -- writable stream receiving the result

    For target i, every column of the alignment receives the next topology
    character of the corresponding sequence, or '-' where that sequence has
    a gap. A warning is written to stderr when the target ID differs from
    the 'seqid2' of alignment record i. Any exception is announced on
    stderr and re-raised. Returns 0.
    """
    try:
        (queryID, queryAnnotation, queryTopology) = myfunc.ReadSingleFasta(queryTopoFile)
        alns = ReadNeedleAlignment(alignFile)
        (targetIDList, targetAnnotationList, targetTopoList) = myfunc.ReadFasta(targetsTopologyFile)

        print >> fpout, "#Number of alignments: %d" % len(targetIDList)

        for idx, targetID in enumerate(targetIDList):
            record = alns[idx]
            alnQuery = record['alnseq1']
            alnTarget = record['alnseq2']

            if targetID != record['seqid2']:
                print >> sys.stderr, "seqID does not match, record %d" % idx

            colsQuery = []
            colsTarget = []
            posQuery = 0   # next unread position in the query topology
            posTarget = 0  # next unread position in the target topology
            for col, resQuery in enumerate(alnQuery):
                resTarget = alnTarget[col]
                if resQuery != '-':
                    colsQuery.append(queryTopology[posQuery])
                    posQuery += 1
                else:
                    colsQuery.append('-')
                if resTarget != '-':
                    colsTarget.append(targetTopoList[idx][posTarget])
                    posTarget += 1
                else:
                    colsTarget.append('-')
            topoaln1 = "".join(colsQuery)
            topoaln2 = "".join(colsTarget)

            # one block per alignment, terminated by an empty line
            print >> fpout, "#Topology alignment %d" % (idx + 1)
            print >> fpout, ">%s" % queryAnnotation
            print >> fpout, "%s" % topoaln1
            print >> fpout, ">%s" % targetAnnotationList[idx]
            print >> fpout, "%s" % topoaln2
            print >> fpout
    except:
        # announce where the failure happened, then let it propagate
        print >>sys.stderr, "except for the function:%s"%sys._getframe().f_code.co_name
        raise
    return 0
Ejemplo n.º 5
0
def GetPairTopoAln(pairalnTopoFile):#{{{
    """Read a FASTA file of consecutive record pairs into a dictionary.

    Records 2k and 2k+1 form pair k; a trailing unpaired record is ignored.
    Each pair is stored under the key "<id1>-<id2>" as a dict with the keys
    'id1', 'id2', 'anno1', 'anno2', 'seq1' and 'seq2'.
    """
    (idList, annoList, seqList) = myfunc.ReadFasta(pairalnTopoFile)
    pairTopoAlnDict = {}
    for k in xrange(len(idList) // 2):
        first = 2 * k
        second = first + 1
        pairTopoAlnDict["%s-%s" % (idList[first], idList[second])] = {
            'id1': idList[first],
            'id2': idList[second],
            'anno1': annoList[first],
            'anno2': annoList[second],
            'seq1': seqList[first],
            'seq2': seqList[second],
        }
    return pairTopoAlnDict
Ejemplo n.º 6
0
def RandFasta(inFile, N, rand_seed, fpout):  #{{{
    """Write a random sample of at most N records of a FASTA file to fpout.

    The file is read with myfunc.ReadFasta(inFile, BLOCK_SIZE). The random
    generator is seeded with rand_seed before sampling, and N is capped at
    the number of records read.

    Returns -1 when ReadFasta yields None for the ID list, otherwise 0.
    """
    (idList, annotationList, seqList) = myfunc.ReadFasta(inFile, BLOCK_SIZE)
    if idList is None:
        print("Failed to read fastafile %s. Exit." % inFile, file=sys.stderr)
        return -1
    random.seed(rand_seed)
    numSeq = len(idList)
    numSample = min(N, numSeq)
    # sample record indices without replacement, then emit in sampled order
    for idx in random.sample(list(range(numSeq)), numSample):
        fpout.write(">%s\n" % annotationList[idx])
        fpout.write("%s\n" % seqList[idx])
    return 0
Ejemplo n.º 7
0
def action(method, alnfile, outfile):
    """Rewrite an alignment after processing it with lcmp.RemoveUnnecessaryGap.

    Args:
        method: 0 selects lcmp.RemoveUnnecessaryGap_old, any other value
            selects lcmp.RemoveUnnecessaryGap.
        alnfile: FASTA alignment read with myfunc.ReadFasta.
        outfile: destination path; the empty string means sys.stdout.

    Returns:
        0 on success, 1 if an IOError occurs while opening or writing the
        output (a message is emitted through click.echo).
    """
    (seqidList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
    if method == 0:
        newSeqList = lcmp.RemoveUnnecessaryGap_old(seqList)
    else:
        newSeqList = lcmp.RemoveUnnecessaryGap(seqList)

    def write_records(fp):
        # one ">annotation" line followed by the processed sequence
        for i in range(len(seqidList)):
            fp.write(">%s\n" % (seqAnnoList[i]))
            fp.write("%s\n" % (newSeqList[i]))

    try:
        if outfile == "":
            write_records(sys.stdout)
        else:
            # the context manager closes the file even when a write fails,
            # which the previous explicit close() did not
            with open(outfile, "w") as fpout:
                write_records(fpout)
        return 0
    except IOError:
        click.echo("Failed to write to file %s" % (outfile))
        return 1
Ejemplo n.º 8
0
def AddPairwiseAlignmentFactor(pairlistDict, msapath, msaext, #{{{
        isLocalAlignment):
    cntfamid = 0
    verbose = g_params['verbose']
    for famid in pairlistDict:
        cntfamid += 1
        if verbose >= 2:
            print "Add pairwise alignment factor for %d: %s"%(cntfamid, famid)
        msafile = msapath + os.sep + famid + msaext
        if not os.path.exists(msafile):
            print >> sys.stderr, "msafile %s does not exist. Ignore" % msafile
            continue
        (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
        msaDict = {}
        for i in xrange(len(idList)):
            msaDict[idList[i]] = seqList[i]
        pairlist = pairlistDict[famid]
        #print "pairlist=", pairlist
        for i in xrange(len(pairlist)):
            pair = pairlist[i]
            #print "pair = ", pair
            seq1 = ""
            seq2 = ""
            id1 = pair[0]
            id2 = pair[1]
            if id1 in msaDict and id2 in msaDict:
                seq1 = msaDict[id1] 
                seq2 = msaDict[id2]
                [seq1, seq2] = lcmp.RemoveUnnecessaryGap([seq1, seq2])
                if len(seq1) != len(seq2):
                    print >> sys.stderr, "Bad alignment for %s and %s" %(id1,id2)
                else:
                    alignFactor = lcmp.GetAlignmentFactorFromPairAlignment(
                            seq1,seq2, isLocalAlignment)
                    pair.append(alignFactor)
            else:
                if id1 not in msaDict:
                    print >> sys.stderr, "%s not in msafile %s"%(id1, msafile)
                if id2 not in msaDict:
                    print >> sys.stderr, "%s not in msafile %s"%(id2, msafile)
    return 0
Ejemplo n.º 9
0
#!/usr/bin/env python
import os, sys, myfunc
from math import ceil

# Split a pairwise-alignment FASTA file (records 2p and 2p+1 form pair p)
# into `nsplit` files named <outpath>/split_<i>.fa.
# NOTE(review): the visible code writes only the first record of each pair,
# never advances `bp` and never closes `fpout` -- this snippet looks
# truncated; confirm against the complete script before relying on it.
file_pairalnfile="/data3/wk/MPTopo/pfamAna_refpro/cellular_filter_all/pairwise/withinClan/Pfam-A-full.perTM75_nseq20.nr100.filtered.withinclan.max30000.kalignP.pairaln"

(idList, annoList, seqList) = myfunc.ReadFasta(file_pairalnfile)

numseq = len(idList)

outpath = "splitted"

# create the output directory (no error if it already exists)
os.system("mkdir -p %s"%outpath)

nsplit = 10

# two consecutive records per pair; the script uses xrange, i.e. Python 2,
# where "/" on ints is integer division
numPair = numseq / 2
# number of pairs per output file, rounded up
pairPerSplit = int(ceil(float(numPair) / nsplit))

# index of the first pair of the current split
bp = 0
for i in xrange(nsplit):
    outfile=outpath + os.sep + "split_%d" %i + ".fa"
    fpout = open(outfile, "w")
    for p in range(bp, bp + pairPerSplit):
        # the last split may hold fewer than pairPerSplit pairs
        if p < numPair:
            anno1 = annoList[2*p]
            anno2 = annoList[2*p+1]
            seq1 = seqList[2*p]
            seq2 = seqList[2*p+1]
            fpout.write(">%s\n"%anno1)
            fpout.write("%s\n"%seq1)
Ejemplo n.º 10
0
def main(g_params):  #{{{
    """Benchmark predicted topologies against a file of real topologies.

    Options are read from sys.argv:
        -o, -outfile FILE      result table, opened via
                               myfunc.myopen(outfile, sys.stdout, ...)
        -owrong FILE           stream handed to Benchmark as fpout_wrong
        -realtopo FILE         FASTA file with the real topologies
        -seqfile FILE          optional FASTA file with the sequences
        -mode tp|tps           prediction type; when empty it is derived
                               from the text of path_predtopo
        -path_predtopo PATH    location of the predicted topology files
        -basename NAME         base name of the predicted topology files
        -restrictidlist FILE   restrict the benchmark to the listed IDs
        -q, -rmsp, -debug      set g_params['isQuiet'], ['isRMSP'],
                               ['isDEBUG'] to True
        -h, --help             print the help text
    Positional arguments are rejected, also after "--".

    The keys 'mode', 'path_predtopo' and 'basename' of g_params must
    already exist when this function is called; 'restrictIDset' and
    'isRestrictIDList' are filled in when -restrictidlist is given.

    Returns:
        1 on any usage or input error; None (implicit) after the benchmark
        has been written.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    real_topofile = ""
    seqfile = ""
    restrictIDListFile = ""
    outfile_wrong_predtopo = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # this program takes no positional arguments
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-owrong", "--owrong"]:
                (outfile_wrong_predtopo, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-realtopo", "--realtopo"]:
                (real_topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mode", "--mode"]:
                (g_params['mode'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-path_predtopo", "--path_predtopo"]:
                (g_params['path_predtopo'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-restrictidlist", "--restrictidlist"]:
                (restrictIDListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-rmsp", "--rmsp"]:
                g_params['isRMSP'] = True
                i += 1
            elif argv[i] in ["-debug", "--debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1

    if myfunc.checkfile(g_params['path_predtopo'], "path_predtopo") != 0:
        return 1
    if g_params['basename'] == "":
        print >> sys.stderr, "%s: basename not set. exit" % (argv[0])
        return 1
    if myfunc.checkfile(real_topofile, "real_topofile") != 0:
        return 1

    if restrictIDListFile != "":
        g_params['restrictIDset'] = set(myfunc.ReadIDList(restrictIDListFile))
        g_params['isRestrictIDList'] = True

    if g_params['mode'] == "":
        # "topcons_single" must be tested first: it contains "topcons"
        if g_params['path_predtopo'].find("topcons_single") >= 0:
            g_params['mode'] = "tps"
        elif g_params['path_predtopo'].find("topcons") >= 0:
            g_params['mode'] = "tp"
        else:
            # the message used to reference an undefined local name
            # (path_predtopo), so this branch died with a NameError
            print >> sys.stderr, "mode not set, and can not be recognized from path_predtopo=%s" % (
                g_params['path_predtopo'])
            return 1

    if not g_params['mode'] in ["tp", "tps"]:
        print >> sys.stderr, "Unrecognized mode = %s" % (g_params['mode'])
        return 1

    (real_idlist, real_annolist,
     real_topolist) = myfunc.ReadFasta(real_topofile)
    seqDict = {}
    if seqfile != "" and os.path.exists(seqfile):
        (seq_idlist, seq_annolist, seqlist) = myfunc.ReadFasta(seqfile)
        for i in xrange(len(seq_idlist)):
            seqDict[seq_idlist[i]] = seqlist[i]

    if len(real_idlist) <= 0:
        print >> sys.stderr, "Failed to read real_topofile %s" % (
            real_topofile)
        return 1

    real_topodict = {}
    for i in xrange(len(real_idlist)):
        real_topodict[real_idlist[i]] = real_topolist[i]

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout_wrong = myfunc.myopen(outfile_wrong_predtopo, None, "w", False)

    # IDs of real topologies with exactly one / more than one TM helix
    idSet_single = set([])
    idSet_multi = set([])
    for seqid in real_topodict:
        topo = real_topodict[seqid]
        numTM = myfunc.CountTM(topo)
        if numTM == 1:
            idSet_single.add(seqid)
        elif numTM > 1:
            idSet_multi.add(seqid)

    # Only "All_Alpha" is benchmarked at present; the else-branch builds
    # the subsets needed once "Single" / "Multi" are added to this list.
    for TM_type in ["All_Alpha"]:
        if TM_type == "All_Alpha":
            sub_real_topodict = real_topodict
        else:
            sub_real_topodict = {}
            for seqid in real_topodict:
                topo = real_topodict[seqid]
                numTM = myfunc.CountTM(topo)
                if TM_type == "Single" and numTM == 1:
                    sub_real_topodict[seqid] = topo
                elif TM_type == "Multi" and numTM > 1:
                    sub_real_topodict[seqid] = topo
        Benchmark(sub_real_topodict, idSet_single, idSet_multi, TM_type, fpout,
                  fpout_wrong, seqDict)

    myfunc.myclose(fpout)
Ejemplo n.º 11
0
def Benchmark(real_topodict, idSet_single, idSet_multi, TM_type, fpout,
              fpout_wrong, seqDict):  #{{{
    """Write a precision table of the predicted topologies to fpout.

    real_topodict -- maps sequence ID to its real topology
    idSet_single, idSet_multi -- accepted but not used by this function
    TM_type       -- label written in the table header and forwarded to
                     CountIdenticalTopology
    fpout         -- stream receiving the table
    fpout_wrong, seqDict -- forwarded to CountIdenticalTopology

    One row is produced per prediction group. The groups depend on
    g_params['mode'] ("tps": 40-44 and All, "tp": 50-55 and All) and each
    is read from a file below g_params['path_predtopo'] whose name is built
    from g_params['basename'], the group and, when g_params['isRMSP'] is
    set, the suffix ".RMSP". With g_params['isRestrictIDList'] set, only
    IDs in g_params['restrictIDset'] are considered.
    """
    mode = g_params['mode']
    if mode == "tps":
        itemlist = ["40", "41", "42", "43", "44", "All"]
    elif mode == "tp":
        itemlist = ["50", "51", "52", "53", "54", "55", "All"]

    isRestrictIDList = g_params['isRestrictIDList']
    addname = ".RMSP" if g_params['isRMSP'] else ""

    if isRestrictIDList:
        numRealTopo = len(g_params['restrictIDset']
                          & set(real_topodict.keys()))
    else:
        numRealTopo = len(real_topodict)

    # Step 1, read in predicted topology
    pred_topodict_list = []
    for item in itemlist:
        isAllGroup = (item.upper() == "ALL")
        pred_topofile = ""
        if mode == "tps":
            if isAllGroup:
                pred_topofile = "%s/%s.topcons-single_topcons_single%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
            else:
                pred_topofile = "%s/%s_topcons_single.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)
        elif mode == "tp":
            if isAllGroup:
                pred_topofile = "%s/%s.topcons.result_TOPCONS%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
            else:
                pred_topofile = "%s/%s.topcons.result_TOPCONS.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)

        (pred_idlist, pred_annolist,
         pred_topolist) = myfunc.ReadFasta(pred_topofile)
        if len(pred_idlist) <= 0:
            print >> sys.stderr, "Failed to read pred_topofile %s" % (
                pred_topofile)

        pred_topodict = {}
        for k, seqid in enumerate(pred_idlist):
            if isRestrictIDList and seqid not in g_params['restrictIDset']:
                continue
            pred_topodict[seqid] = pred_topolist[k]
        pred_topodict_list.append(pred_topodict)

    # Step 2, calculate precision of the prediction
    # header line
    fpout.write("#%s\n" % (TM_type))
    fpout.write("#%2s %7s %8s %8s %8s %8s %8s %8s %8s\n" %
                ("No", "Group", "nIDT", "nINV", "nPred", "PPV(%)", "NPV_INV",
                 "NPV_Other", "nAllReal"))
    for i, item in enumerate(itemlist):
        pred_topodict = pred_topodict_list[i]
        numPredTopo = len(pred_topodict)

        (numIDTtopo,
         numINVtopo) = CountIdenticalTopology(pred_topodict, real_topodict,
                                              item, TM_type, fpout_wrong,
                                              seqDict, item)
        numOther = numPredTopo - numIDTtopo - numINVtopo

        ss = "%-3d %7s %8d %8d %8d %8.1f %8.1f %8.1f %8d" % (
            i, item, numIDTtopo, numINVtopo, numPredTopo,
            myfunc.FloatDivision(numIDTtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numINVtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numOther, numPredTopo) * 100.0,
            numRealTopo)
        fpout.write("%s\n" % (ss))
    fpout.write("\n")
Ejemplo n.º 12
0
def main(g_params):#{{{
    """Mask the topologies of a FASTA file using signal peptide predictions.

    Arguments are read from sys.argv:
        infile (positional)   topology file, read with myfunc.ReadFasta
        -o FILE               output file, opened via
                              myfunc.myopen(outfile, sys.stdout, ...)
        -sp FILE              signal peptide file (required)
        -f, -format NAME      format of the signal peptide file:
                              "signalp" (default) or "phobius"
        -q                    set g_params['isQuiet'] to True
        -h, --help            print the help text
        --                    take the next argument as infile

    The topologies returned by MaskTopologyBySignalPeptide are written as
    FASTA records that reuse the original annotation lines.

    Returns:
        1 on a usage or input error; None (implicit) on success.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    signalp_file = ""
    format_sp = "signalp"

    i = 1
    isNonOptionArg=False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o"]:
                outfile = argv[i+1]
                i += 2
            elif argv[i] in ["-sp", "--sp"] :
                signalp_file = argv[i+1]
                i += 2
            elif argv[i] in ["-f", "--f", "-format", "--format"] :
                format_sp = argv[i+1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if infile == "" or not os.path.exists(infile):
        print >> sys.stderr, "infile not set or does not exist"
        return 1
    if signalp_file == "" or not os.path.exists(signalp_file):
        print >> sys.stderr, "signalp file not set or does not exist"
        return 1
    if not format_sp in ["signalp", "phobius"]:
        print >> sys.stderr, "format_sp = %s is not supported. Exit." %(
                format_sp)
        # the message announces an exit, so actually stop here instead of
        # handing an unsupported format to ReadSignalPeptide
        return 1

    signalpDict = ReadSignalPeptide(signalp_file, format_sp)
    (idList, annoList, topoList) = myfunc.ReadFasta(infile)

    newTopoList = MaskTopologyBySignalPeptide(idList, topoList, signalpDict)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    for i in xrange(len(idList)):
        fpout.write(">%s\n"%(annoList[i]))
        fpout.write("%s\n"%(newTopoList[i]))

    myfunc.myclose(fpout)
Ejemplo n.º 13
0
def ReadSeqDBDict(infile):  #{{{
    """Map every sequence ID of a FASTA file to (annotation, sequence).

    The file is parsed with myfunc.ReadFasta; when an ID occurs more than
    once, the last record wins.
    """
    (idList, annotationList, seqList) = myfunc.ReadFasta(infile)
    seqdbDict = {}
    for idx, seqid in enumerate(idList):
        seqdbDict[seqid] = (annotationList[idx], seqList[idx])
    return seqdbDict
Ejemplo n.º 14
0
def main(g_params):  #{{{
    """Insert the database sequence into every record of an annotation file.

    Arguments are read from sys.argv:
        infile (positional)   FASTA-like annotation file
        -outfile FILE         output file, opened via
                              myfunc.myopen(outfile, sys.stdout, ...)
        -seqdb FILE           sequence database handed to GetSeqDict
        -q                    set g_params['isQuiet'] to True
        -h, --help            print the help text
        --                    take the next argument as infile

    For each record of infile whose ID is found in the sequence dictionary,
    the annotation line, the sequence and (when non-empty) the record's own
    content are written. Unknown IDs are reported on stderr.

    Returns 1 on a usage or input error; None (implicit) otherwise.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    seqdbfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            # the argument right after "--" is always the input file
            infile = arg
            isNonOptionArg = False
            i += 1
            continue
        if arg == "--":
            isNonOptionArg = True
            i += 1
            continue
        if arg[0] != "-":
            infile = arg
            i += 1
            continue

        if arg in ("-h", "--help"):
            PrintHelp()
            return 1
        if arg in ("-outfile", "--outfile"):
            outfile = argv[i + 1]
            i += 2
        elif arg in ("-seqdb", "--seqdb"):
            seqdbfile = argv[i + 1]
            i += 2
        elif arg in ("-q",):
            g_params['isQuiet'] = True
            i += 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", arg
            return 1

    if infile == "":
        print >> sys.stderr, "annotation file not set"
        return 1
    if not os.path.exists(infile):
        print >> sys.stderr, "annotation file %s does not exist" % (infile)
        return 1
    if seqdbfile == "":
        print >> sys.stderr, "seqdbfile file not set"
        return 1
    if not os.path.exists(seqdbfile):
        print >> sys.stderr, "seqdbfile file %s does not exist" % (seqdbfile)
        return 1

    seqDict = GetSeqDict(seqdbfile)
    if seqDict == {}:
        print >> sys.stderr, "Failed to read seqdbfile %s" % (seqdbfile)
        return 1

    (idList, annoList, contentList) = myfunc.ReadFasta(infile)
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    for idx, seqid in enumerate(idList):
        try:
            seq = seqDict[seqid]
        except KeyError:
            print >> sys.stderr, "seqid %s not found in seqdb" % (seqid)
            continue
        fpout.write(">%s\n" % (annoList[idx]))
        fpout.write("%s\n" % (seq))
        if contentList[idx] != "":
            fpout.write("%s\n" % (contentList[idx]))

    myfunc.myclose(fpout)
Ejemplo n.º 15
0
# Validate the two input files, then load the tree and the FASTA records.
# `fastafile` and `treefile` are defined outside the visible part of the
# script.
if not os.path.exists(fastafile):
    print("Error! file fastafile (%s) does not exist." % fastafile,
          file=sys.stderr)
    sys.exit(1)
if not os.path.exists(treefile):
    print("Error! file treefile (%s) does not exist." % treefile,
          file=sys.stderr)
    sys.exit(1)

# NOTE(review): Tree is imported outside this fragment (presumably a
# phylogenetic-tree class) -- confirm which library provides it.
t = Tree(treefile)
leaves = t.get_leaves()
# leaf names as a list (tree order) and as a set (fast membership tests)
leafNameList = [x.name for x in leaves]
leafNameSet = set(leafNameList)
(idList, annotationList, seqList) = myfunc.ReadFasta(fastafile)

# output goes to standard output
fpout = sys.stdout
numSeq = len(idList)

# write settings
dataset_settings = """\
TREE_COLORS
#use this template to define branch colors and styles, colored ranges and label colors/font styles/backgrounds
#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
Ejemplo n.º 16
0
# Small command-line harness: the first argument (TESTMODE) selects which
# check to run; later arguments are consumed by the selected mode.
progname = os.path.basename(sys.argv[0])
general_usage = """ 
usage: %s TESTMODE options
""" % (sys.argv[0])

numArgv = len(sys.argv)
# without a TESTMODE there is nothing to do: show the usage and stop
if numArgv <= 1:
    print(general_usage)
    sys.exit(1)
TESTMODE = sys.argv[1]

g_params = {}

# "loadpil": load a TrueType font through ImageFont.truetype
# NOTE(review): `rundir` and `ImageFont` are not defined in this fragment;
# they must be provided earlier in the script.
if TESTMODE == "loadpil":
    g_params['font_dir'] = "%s/../fonts/truetype/ttf-dejavu/" % (rundir)
    g_params['font_size'] = 16
    fontpath = g_params['font_dir'] + "DejaVuSerif.ttf"
    print(fontpath)
    g_params['fntTMbox_label'] = ImageFont.truetype(fontpath, 10)

# "getgapposition": print myfunc.GetGapPosition for the topology string
# given as the second argument (its presence is not checked)
if TESTMODE == "getgapposition":
    topo = sys.argv[2]
    posGAP = myfunc.GetGapPosition(topo)
    print(posGAP)

# "readfasta": print the IDs and sequences read from the FASTA file given
# as the second argument (its presence is not checked)
if TESTMODE == "readfasta":
    seqfile = sys.argv[2]
    (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
    print(idList)
    print(seqList)
Ejemplo n.º 17
0
def start_boctopus(infile, blastpath, modHome, hmmfilename, ws_cytosolic, ws_extracellular, ws_lipidfacing, ws_porefacing, \
    fakedbpath, dbpath, blastpgppath, hhsearchpath, hhblitspath, rpath):
    print "boctopus2 will start with ", infile

#     f = open(infile, "r")#{{{ DELETED
#     lines = f.readlines()
#     f.close()
# 
#     pname   = []
#     seqname = []
#     tempseq = ""
#     for line in lines:
#         line = line.strip()
# 
#         if line.startswith(">"):
#             pname.append(line[1:])
#             if len(tempseq) > 0:
#                 seqname.append(tempseq)
#             tempseq = ""
#         else:
#             tempseq += line
# 
#     if len(tempseq) > 0:
#         seqname.append(tempseq)
# 
#     print pname
#     print seqname
# 
#     if len(pname) != len(seqname):
#         print "number of pnames and seqs not the same."
#     else:#}}}

    # rewrite sequence reading part
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(infile)
    if len(seqidlist) <= 0:
        print >> sys.stderr, "No valid sequences read from file '%s'"%(infile)
        return 1

    #for i in range(0, len(pname)):
    for i in xrange(len(seqidlist)):
        seqid = seqidlist[i]
        seq = seqlist[i]
        seqanno = seqannolist[i]
        print "processing ", i , seqanno

        subtmpdir = "%s/seq_%d"%(tmpdir, i)
        if os.path.exists(subtmpdir):
            shutil.rmtree(subtmpdir)
        os.makedirs(subtmpdir)

        singleseqfile = "%s/query.fa"%(subtmpdir)
        myfunc.WriteFile(">%s\n%s\n"%(seqanno, seq), singleseqfile, mode="w", isFlush=True)

        if not os.path.exists(singleseqfile):
            print >> sys.stderr, "Failed to write to singleseqfile %s"%(singleseqfile)
            continue

        command = "python "+ "%s/boctopus_startHMM.py "%(rundir) + singleseqfile + " " + blastpath + " " + modHome + " " + hmmfilename + " " + ws_cytosolic + " " + ws_extracellular + " " + ws_lipidfacing + " " + ws_porefacing + " " + rpath+ " " +fakedbpath+\
" " + dbpath+ " " + blastpgppath+ " " + hhsearchpath + " " + hhblitspath
        print command
        os.system(command)
        outpath_this_seq = "%s/seq_%d"%(outpath, i)
        if not os.path.exists(outpath_this_seq):
            os.makedirs(outpath_this_seq)
        filepair_to_copy = [
                ("%s/query.fa"%subtmpdir, "%s/query.fa"%outpath_this_seq),
                ("%s/output/query_ioIOS.prf.txt_svm_topo.png"%subtmpdir, "%s/query.predict.png"%(outpath_this_seq)),
                ("%s/output/query_topologies.txt"%(subtmpdir), "%s/query_topologies.txt"%outpath_this_seq),
                ("%s/svmoutput/query_ioIOS.prf.txt"%subtmpdir, "%s/profile.txt"%outpath_this_seq),
                ("%s/pssm/query.filtered.pssmvals"%subtmpdir, "%s/pssm.txt"%(outpath_this_seq))

        ]
        for tup in filepair_to_copy:
            shutil.move(tup[0], tup[1])


    return
Ejemplo n.º 18
0
def main():  #{{{
    # Scratch driver: a sequence of independent ad-hoc experiments, each
    # wrapped in a literal guard.  Every section is disabled with `if 0:`
    # except the very last one (`if 1:`), which is the only code that runs.
    # Most sections take their input path from sys.argv[1].

    # Experiment: compare two hard-coded gapped topology strings with
    # ct.CompareToposGaplesslyNew and print both inputs afterwards.
    if 0:  #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: int, float and str arguments are not changed by a call,
        # whereas dict and list arguments can be modified in place by the
        # callee.  The prints below show the strings after the call.
        print "strTop1:", strTop1
        print "strTop2:", strTop2
#}}}
    # Experiment: call PrintFuncName() and print this file's name.
    if 0:  #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
#}}}
    # Experiment: read the whole file at once with readlines().
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
#}}}
    # Experiment: read the file in BLOCK_SIZE chunks until read() returns
    # an empty string; the data itself is discarded.
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
#}}}
    # Experiment: read the file one line at a time with readline(); the
    # lines are discarded.
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
        #}}}
    # Experiment: stream FASTA records block by block.  Whatever
    # myfunc.ReadFastaFromBuffer returns is carried over and prepended to
    # the next chunk; a short read marks the end of the file.
    if 0:  #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        # record[1] is written as the header line and
                        # record[2] as the sequence line
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            # re-raised unchanged, so this handler adds no handling
            raise
            #}}}
    # Experiment: read a FASTA file with myfunc.ReadFasta_without_id and
    # echo every record to stdout.
    if 0:  #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            # re-raised unchanged, so this handler adds no handling
            raise
            #}}}
    # Experiment: run IsDuplicatedByHHSearch on a hard-coded .hhr file.
    if 0:  #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"

#}}}
    # Experiment: compute the alignment factor of a hard-coded aligned pair.
    # seq1/seq2 are assigned twice; the second pair overrides the first.
    if 0:  #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
#}}}
    # Experiment: open a MyDB database, fetch the record with id "A0FGX9"
    # and split it with myfunc.ExtractFromSeqWithAnno.  A missing
    # sys.argv[1] (IndexError) is silently ignored.
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    # Experiment: for every non-empty id listed in the file sys.argv[2],
    # fetch its record from the MyDB database sys.argv[1] and write it to
    # stdout.
    if 0:  #{{{
        import my_extractdb
        # mimicking my_extractdb.py to see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
            #             for rd in  cls.GetAllRecord():
            #                 print rd
#                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
#                print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
#}}}
    # Experiment: print every line delivered by ReadLineByBlock; the loop
    # stops when readlines() returns None.
    if 0:  #{{{ #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()

        except IndexError:
            pass
#}}}
    # Experiment: time ReadLineByBlock against a plain readline() loop on
    # the same file and print both elapsed times.
    if 0:  #{{{ #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]

            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))

            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))

        except IndexError:
            pass
#}}}
    # Experiment: print a file line by line using readline().
    if 0:  #{{{ #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
#}}}
    # Experiment: call myfunc.GetFirstWord nloop times (nloop comes from
    # sys.argv[1]).  `delimiter` is assigned twice but is not passed to the
    # call that is currently active.
    if 0:  #{{{ #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            #            string = "kajsdfasdfsdfjakasjdfka"
            #            string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
#}}}
    # Experiment: time four ways of reading the same FASTA file and print
    # the number of records and the elapsed time of each.
    if 0:  #{{{ # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))

            # 3. ReadFasta from buffer
            # Same chunked loop as the streaming experiment above, but only
            # the records are counted.
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            # readseq() returning None ends the loop
            while recordList != None:
                cnt += len(recordList)
                #                 for rd in recordList:
                #                     print ">%s"%rd.description
                #                     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
#}}}
    # Experiment: run lcmp.RemoveUnnecessaryGap_old and
    # lcmp.RemoveUnnecessaryGap on the same input, printing the timing of
    # each to stderr and the resulting sequences to stdout.
    if 0:  #{{{ #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)

            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

        except IndexError:
            pass
#}}}
    # Experiment: read records with myfunc.ReadMPAByBlock and print each
    # one as a FASTA entry, converting rd.mpa with myfunc.mpa2seq.
    if 0:  #{{{ #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
#}}}
    # Experiment: same MyDB lookup of record "A0FGX9" as the earlier MyDB
    # section (the code is repeated verbatim).
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    # Experiment: run commands through the shell (shell=True), so the shell
    # handles ';' and expands the 'topo*.py' pattern.
    if 0:  #{{{ #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
#}}}
    # The only enabled section: run commands from argument lists without a
    # shell.
    if 1:  #{{{ #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            # Without a shell the pattern is not expanded: ls receives the
            # literal argument 'topo*.py'.  check_call raises
            # CalledProcessError when ls exits with a non-zero status.
            print subprocess.check_call(["ls",
                                         "topo*.py"])  #This will not work
        except subprocess.CalledProcessError, e:
            print "error message:", e
        # Expand the pattern in Python instead and pass the matches to ls.
        subprocess.call(["ls"] + glob.glob("topo*.py"))
Ejemplo n.º 19
0
def WritePairAln(pairlistDict, msapath, msaext, outname):#{{{
    """Write the pairwise alignments of selected sequence pairs, family by family.

    For each family id in ``pairlistDict`` the multiple sequence alignment
    ``msapath + os.sep + famid + msaext`` is read with ``myfunc.ReadFasta``.
    For every pair whose two ids both occur in that alignment, the two
    aligned rows are passed through ``lcmp.RemoveUnnecessaryGap`` and, when
    the results have equal length, written to the output files.  Families
    whose alignment file is missing are reported on stderr and skipped;
    pairs with an id absent from the alignment are skipped silently.

    Output files (all opened for writing, so existing files are replaced):
      * ``outname + ".pairaln"``            -- the aligned pairs in FASTA format
      * ``outname + ".tableinfo"``          -- one table row per written pair
      * ``outname + ".pairlistwithpfamid"`` -- ``id1 id2 famid`` per written pair

    Args:
        pairlistDict: mapping of family id to a sequence of pairs.  Each
            pair is read as ``pair[0]`` (first id), ``pair[1]`` (second id)
            and ``pair[2]``, a mapping with the keys 'seqidt0', 'seqidt1',
            'seqidt2', 'alnLength', 'seqLength1', 'seqLength2', 'numIDT'
            and 'numGap'.
        msapath: directory holding the per-family alignment files.
        msaext: suffix appended to the family id to form the file name.
        outname: common prefix of the three output files.

    Returns:
        0 on success, 1 if one of the output files cannot be opened.
    """
    verbose = g_params['verbose']
    outAlnFile = outname + ".pairaln"
    outTableFile = outname + ".tableinfo"
    outSelPairList = outname + ".pairlistwithpfamid"

    # Every handle that was opened is closed by the finally clause, both on
    # the early "cannot open" return and when an exception escapes the loop.
    opened = []
    try:
        for path in (outAlnFile, outTableFile, outSelPairList):
            try:
                opened.append(open(path, "w"))
            except IOError:
                sys.stderr.write("Failed to write to file %s\n" % path)
                return 1
        (fpout_aln, fpout_table, fpout_list) = opened

        fpout_table.write("#%-15s %-15s %6s %6s %9s %6s %6s %9s %6s %6s %6s %6s %6s\n" % (
            "Seq1","Seq2", "IDT0", "SIM0", "AlnLength", "Len1","Len2",
            "Score","N_IDT", "N_SIM", "N_GAP", "IDT1", "IDT2"))

        for famid in pairlistDict:
            if verbose >= 2:
                print("Write pairwise alignment for %s" % (famid))
            msafile = msapath + os.sep + famid + msaext
            if not os.path.exists(msafile):
                sys.stderr.write(
                    "msafile %s does not exist. Ignore\n" % msafile)
                continue
            (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
            msaDict = dict(zip(idList, seqList))
            annoDict = dict(zip(idList, annoList))

            for pair in pairlistDict[famid]:
                id1 = pair[0]
                id2 = pair[1]
                if id1 not in msaDict or id2 not in msaDict:
                    continue
                [seq1, seq2] = lcmp.RemoveUnnecessaryGap(
                    [msaDict[id1], msaDict[id2]])
                if len(seq1) != len(seq2):
                    sys.stderr.write(
                        "Bad alignment for %s and %s\n" % (id1, id2))
                    continue
                rd = pair[2]
                fpout_aln.write(">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n"%(
                    annoDict[id1], id2, rd['seqidt0'], rd['seqidt1']))
                fpout_aln.write("%s\n"%seq1)
                fpout_aln.write(">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n"%(
                    annoDict[id2], id1, rd['seqidt0'], rd['seqidt1']))
                fpout_aln.write("%s\n"%seq2)
                # SIM0, Score and N_SIM are not available here and are
                # written as the placeholders -1.0 / -1.
                fpout_table.write("%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n"% (
                    id1, id2, rd['seqidt0'], -1.0,
                    rd['alnLength'],
                    rd['seqLength1'], rd['seqLength2'],
                    -1.0,
                    rd['numIDT'], -1, rd['numGap'],
                    rd['seqidt1'], rd['seqidt2']))
                fpout_list.write("%s %s %s\n"%(id1, id2, famid))
    finally:
        for fp in opened:
            fp.close()

    # NOTE: outSelPairList is written as well but is not listed here.
    print("Result output to ")
    print("\t%s" % outAlnFile)
    print("\t%s" % outTableFile)

    return 0
Ejemplo n.º 20
0
                PrintHelp();
                sys.exit(0);
            elif sys.argv[i] == "-i" or sys.argv[i] == "--infile":
                inFile=sys.argv[i+1];
                i = i + 2;
            elif sys.argv[i] == "-mintm" or sys.argv[i] == "--mintm":
                MIN_NUMTM=int(sys.argv[i+1]);
                i = i + 2;
            elif sys.argv[i] == "-o" or sys.argv[i] == "--out":
                outFile=sys.argv[i+1];
                i = i + 2;
            else:
                print >> sys.stderr,("Error! Wrong argument:%s" % sys.argv[i]);
                sys.exit(1);
        else:
            inFile=sys.argv[i];
            i+=1;
           

    if inFile == "":
        print >> sys.stderr,"Error! Topology file not set.";
        sys.exit(1);


    try :
        (idListTopo,annotationListTopo, topoList) = myfunc.ReadFasta(inFile);
        CleanSingleSpanTMPro(idListTopo, annotationListTopo, topoList);
    except :
        print >>sys.stderr, "except for the input file: %s" % inFile;
        raise ;
Ejemplo n.º 21
0
def DrawPairwiseTopo(pairtopoAlnFile, aaSeqDict, pairCmpclassDict, outpath):
    """Draw every aligned topology pair of a FASTA file as PNG images.

    The records of ``pairtopoAlnFile`` are taken two at a time (records
    2k and 2k+1 form pair k; a trailing unpaired record is ignored).  Pairs
    whose two aligned strings differ in length are reported and skipped.
    For each remaining pair ``<id1>-<id2>`` the function

      1. writes ``<pair>.topoaln.fa`` (the two aligned topologies) and
         ``<pair>.fa`` (the amino-acid sequences found in ``aaSeqDict``),
      2. runs ``dgscanprog`` on ``<pair>.fa`` to produce ``<pair>.dgscan``,
      3. runs ``drawMSATopo.py`` twice and renames its ``<pair>.topoaln.png``
         output to ``<pair>.topoaln.shrinked.png`` and
         ``<pair>.topoaln.nonshrinked.png``,
      4. creates ``thumb.``-prefixed thumbnails with ``convert``,
      5. removes ``<pair>.fa`` and ``<pair>.dgscan``.

    External commands are run with ``os.system`` and their exit status is
    not checked.  File names are derived from sequence ids, which may
    contain shell metacharacters (``|``, spaces, ``;``), so every such
    path is shell-quoted before being placed on a command line.  The
    configuration values ``dgscanprog`` and ``binpath`` (module globals)
    are inserted unchanged.

    Args:
        pairtopoAlnFile: FASTA file with the aligned topology pairs.
        aaSeqDict: mapping of sequence id to amino-acid sequence.
        pairCmpclassDict: currently unused; the filter on the pair's
            comparison class that consumed it is disabled.
        outpath: directory receiving all output files.
    """
    # shlex.quote on Python 3, pipes.quote on Python 2
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    (idList, annoList, seqList) = myfunc.ReadFasta(pairtopoAlnFile)
    numSeq = len(idList)
    numPair = numSeq // 2  # floor division: must stay an int for range()

    print("numSeq =  %d" % numSeq)
    print("numPair =  %d" % numPair)

    for k in range(numPair):
        first = 2 * k
        second = first + 1
        id1 = idList[first]
        id2 = idList[second]
        if len(seqList[first]) != len(seqList[second]):
            print("Error for %s - %s " % (id1, id2))
            continue
        basename = "%s-%s" % (id1, id2)
        prefix = outpath + os.sep + basename
        thumb_prefix = outpath + os.sep + 'thumb.' + basename

        outPairAlnFile = prefix + ".topoaln.fa"
        with open(outPairAlnFile, 'w') as fpout:
            fpout.write(">%s\n" % annoList[first])
            fpout.write("%s\n" % seqList[first])
            fpout.write(">%s\n" % annoList[second])
            fpout.write("%s\n" % seqList[second])

        # Only the ids present in aaSeqDict are written; the file may
        # therefore hold zero, one or two records.
        outAASeqFile = prefix + ".fa"
        with open(outAASeqFile, "w") as fpout:
            for seqid in (id1, id2):
                if seqid in aaSeqDict:
                    fpout.write(">%s\n" % seqid)
                    fpout.write("%s\n" % aaSeqDict[seqid])

        # Output dgscan file
        dgpfile = prefix + '.dgscan'
        os.system("%s %s -lmin 21 -lmax 21 -o %s" % (
            dgscanprog, shell_quote(outAASeqFile), shell_quote(dgpfile)))

        outpngfile = prefix + ".topoaln.png"
        outShrinkedFile = prefix + ".topoaln.shrinked.png"
        thumb_outShrinkedFile = thumb_prefix + ".topoaln.shrinked.png"
        outNonShrinkedFile = prefix + ".topoaln.nonshrinked.png"
        thumb_outNonShrinkedFile = thumb_prefix + ".topoaln.nonshrinked.png"

        # Both drawMSATopo.py runs write to outpngfile, so the image is
        # renamed after each run.
        os.system(
            "python %s/drawMSATopo.py %s -pfm no -shrink yes -method mat" %
            (binpath, shell_quote(outPairAlnFile)))
        os.system("mv %s %s" % (shell_quote(outpngfile),
                                shell_quote(outShrinkedFile)))
        # NOTE(review): this run passes "-method yes" while the run above
        # passes "-method mat" -- confirm that this is intended.
        os.system(
            "python %s/drawMSATopo.py %s -pfm no -shrink no -pdg yes -method yes -dgpfile %s"
            % (binpath, shell_quote(outPairAlnFile), shell_quote(dgpfile)))
        os.system("mv %s %s" % (shell_quote(outpngfile),
                                shell_quote(outNonShrinkedFile)))
        os.system("convert -thumbnail 200 %s %s" %
                  (shell_quote(outShrinkedFile),
                   shell_quote(thumb_outShrinkedFile)))
        os.system("convert -thumbnail 200 %s %s" %
                  (shell_quote(outNonShrinkedFile),
                   shell_quote(thumb_outNonShrinkedFile)))
        os.system("rm -f %s %s" % (shell_quote(outAASeqFile),
                                   shell_quote(dgpfile)))
Ejemplo n.º 22
0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG,
                           isWriteRel):  #{{{
    """Dump the per-sequence prediction results into one text report.

    The sequences of ``seqfile`` are read with ``myfunc.ReadFasta``.  For
    each sequence the result directory is located and, when it contains
    ``Topcons/topcons.top``, a report section is appended to ``outfile``
    and the consensus topology to ``outfile + ".fa"``.  Sequences without
    that file are written to ``outfile + ".unfinished.fa"`` and a summary
    warning is printed on stderr if there is at least one of them.

    Result directory layout, selected by ``g_params['resultPathFormat']``:
      * ``"md5"``: ``path_result/<k[:2]>/<k[2:4]>/<k>/seq_0`` where ``k`` is
        the md5 hex digest of the sequence, tried first without and then
        with a trailing newline; the first existing directory wins.
      * anything else: ``path_result/seq_<index>``.

    Args:
        seqfile: FASTA file with the query sequences.
        path_result: root directory of the prediction results.
        outfile: path of the text report; also the prefix of the two
            FASTA side files.
        isWriteDG: if true, append the lines of ``dg.txt`` that start with
            a digit.
        isWriteRel: if true, append the content of
            ``Topcons/reliability.txt``.

    Returns:
        0 on success, 1 if one of the output files cannot be opened.
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)
    outfile_unfinished_fa = "%s.unfinished.fa" % (outfile)
    numseq = len(seqidlist)

    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS',
        'Homology'
    ]
    # (sub-directory, file name) of the topology file for the methods that
    # do not follow the default "<method>/query.top" layout.
    special_topfile = {
        'TOPCONS': ("Topcons", "topcons.top"),
        'Philius': ("philius", "query.top"),
        'SCAMPI': ("SCAMPI_MSA", "query.top"),
    }

    # Every handle that was opened is closed by the finally clause, also
    # when a later open() fails or an exception escapes the loop.
    opened = []
    try:
        for path in (outfile, outfile_fa, outfile_unfinished_fa):
            try:
                opened.append(open(path, "w"))
            except IOError:
                sys.stderr.write("Failed to write to file \"%s\"\n" % (path))
                return 1
        (fpout, fpout_fa, fpout_unfinished_fa) = opened

        cntUnFinished = 0
        for iseq, (seq, desp) in enumerate(zip(seqlist, seqannolist)):
            length = len(seq)
            if g_params['resultPathFormat'] == "md5":
                md5_key1 = hashlib.md5(seq).hexdigest()
                md5_key2 = hashlib.md5(seq + "\n").hexdigest()
                # If neither directory exists, subdir stays at the last
                # candidate and the sequence is reported as unfinished.
                for md5_key in [md5_key1, md5_key2]:
                    datapath_this_seq = os.sep.join(
                        [path_result, md5_key[:2], md5_key[2:4], md5_key])
                    subdir = "%s/%s" % (datapath_this_seq, "seq_0")
                    if os.path.exists(subdir):
                        break
            else:
                subdir = "%s/seq_%d" % (path_result, iseq)

            if g_params['verbose']:
                print("subdir = %s" % (subdir))

            rstfile = "%s/Topcons/topcons.top" % (subdir)
            if not os.path.exists(rstfile):
                # write unfinished
                fpout_unfinished_fa.write(">%s\n%s\n" % (desp, seq))
                cntUnFinished += 1
                continue

            fpout.write("Sequence number: %d\n" % (iseq + 1))
            fpout.write("Sequence name: %s\n" % (desp))
            fpout.write("Sequence length: %d aa.\n" % (length))
            fpout.write("Sequence:\n%s\n\n\n" % (seq))

            topo_consensus = ""
            for method in methodlist:
                (dirname, filename) = special_topfile.get(
                    method, (method, "query.top"))
                topfile = "%s/%s/%s" % (subdir, dirname, filename)
                seqid = ""
                top = ""
                if os.path.exists(topfile):
                    (seqid, _anno, top) = myfunc.ReadSingleFasta(topfile)
                if top == "":
                    top = "***No topology could be produced with this method***"

                if method == "TOPCONS":
                    topo_consensus = top

                if method == "Homology":
                    # label the section with the id read from the topology
                    # file when there is one
                    showtext_homo = seqid if seqid != "" else method
                    fpout.write("%s:\n%s\n\n\n" % (showtext_homo, top))
                else:
                    fpout.write("%s predicted topology:\n%s\n\n\n" % (method,
                                                                     top))

            if isWriteDG:
                dgfile = "%s/dg.txt" % (subdir)
                dg_content = ""
                if os.path.exists(dgfile):
                    dg_content = myfunc.ReadFile(dgfile)
                # keep only the data rows, i.e. lines starting with a digit
                dglines = [line for line in dg_content.split("\n")
                           if line and line[0].isdigit()]
                if len(dglines) > 0:
                    fpout.write("\nPredicted Delta-G-values (kcal/mol) "
                                "(left column=sequence position; right column=Delta-G)\n\n")
                    fpout.write("\n".join(dglines) + "\n")

            if isWriteRel:
                reliability_file = "%s/Topcons/reliability.txt" % (subdir)
                reliability = ""
                if os.path.exists(reliability_file):
                    reliability = myfunc.ReadFile(reliability_file)
                if reliability != "":
                    fpout.write("\nPredicted TOPCONS reliability (left "
                                "column=sequence position; right column=reliability)\n\n")
                    fpout.write(reliability + "\n")

            fpout.write("##############################################################################\n")

            # write the consensus prediction in FASTA format
            fpout_fa.write(">%s\n" % (desp))
            fpout_fa.write(topo_consensus + "\n")

        # Warn as soon as a single sequence is unfinished.
        if cntUnFinished > 0:
            sys.stderr.write(
                "%s out of %d sequences are with unfinished predictions, please check.\n" % (
                    cntUnFinished, numseq))
    finally:
        for fp in opened:
            try:
                fp.close()
            except IOError:
                pass

    return 0
Ejemplo n.º 23
0
def main(g_params):  #{{{
    """Compute the K/R bias of every sequence and write two result files.

    Command-line options (read from ``sys.argv``):
        -seqfile FILE   FASTA file with the amino-acid sequences
        -topofile FILE  FASTA file with the topologies; the second
                        whitespace-separated token of each annotation
                        line is taken as the comparison class of the id
                        given by the first token
        -outpath DIR    output directory; defaults to the directory of
                        the sequence file (created if it does not exist)
        -maxdist INT    passed to CalKRBias as max_dist (default 12)
        -flankwin INT   passed to CalKRBias as flank_win (default 5)
        -l FILE, positional arguments
                        accepted and stored, but not used by this function
        -q              sets g_params['isQuiet']
        -debug          sets g_params['isDEBUG']
        -h, --help      print the help text

    For every id of the sequence file that also has a topology,
    ``CalKRBias`` is called and the result is written with ``WriteResult``
    to ``<outpath>/<rootname>.krlist.txt``.  For ids whose comparison
    class is "IDT" or "INV" (the latter is also the fallback when no class
    is known) the bias is additionally written, one integer per line, to
    ``<outpath>/<rootname>.krbias.txt``.

    Args:
        g_params: mutable mapping of global settings; 'isQuiet' and
            'isDEBUG' may be set here and 'isDEBUG' is read.

    Returns:
        1 on a usage error or when no sequence/topology could be read;
        None after a normal run.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = None  # None: not given on the command line
    idListFile = None
    idList = []
    seqfile = ""
    topofile = ""
    max_dist = 12  # maximal distance to the TM helix so that K, R residues are counted
    flank_win = 5  # flanking window of the TM helix, residues at position
    #TMbeg-flank_win and TMend+flank_win are also counted

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):  # also safe for an empty argument
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxdist", "--maxdist"]:
                (max_dist, i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-flankwin", "--flankwin"]:
                (flank_win, i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topofile", "--topofile"]:
                (topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (idListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                print("Error! Wrong argument:", argv[i], file=sys.stderr)
                return 1
        else:
            idList.append(argv[i])
            i += 1

    (idListSeq, annoListSeq, seqList) = myfunc.ReadFasta(seqfile)
    (idListTopo, annoListTopo, topoList) = myfunc.ReadFasta(topofile)

    if len(idListSeq) < 1 or len(idListTopo) < 1:
        print("No seq set", file=sys.stderr)
        return 1
    seqDict = dict(zip(idListSeq, seqList))
    topoDict = dict(zip(idListTopo, topoList))

    # Annotation lines without a second token carry no comparison class;
    # such ids fall back to "INV" in the loop below.
    cmpclassDict = {}
    for anno in annoListTopo:
        strs = anno.lstrip(">").split()
        if len(strs) >= 2:
            cmpclassDict[strs[0]] = strs[1]

    # An explicit -outpath wins; otherwise write next to the sequence file.
    if outpath is None:
        outpath = os.path.dirname(seqfile)
    if outpath == "":
        outpath = "."
    if not os.path.isdir(outpath):
        os.makedirs(outpath)
    rootname = os.path.basename(os.path.splitext(seqfile)[0])
    outfile_kr_list = outpath + os.sep + rootname + ".krlist.txt"
    outfile_krbias = outpath + os.sep + rootname + ".krbias.txt"

    with open(outfile_kr_list, "w") as fpout_krlist, \
            open(outfile_krbias, "w") as fpout_krbias:
        for idd in idListSeq:
            if g_params['isDEBUG']:
                print("seqid: %s" % (idd))
            if idd not in topoDict:
                print("no topo for %s" % idd, file=sys.stderr)
                continue
            topo = topoDict[idd]
            # idd comes from idListSeq, so it is always a key of seqDict
            seq = seqDict[idd]
            cmpclass = cmpclassDict.get(idd, "INV")
            (kr_bias, KR_pos_list, numTM) = CalKRBias(seq, topo, flank_win,
                                                      max_dist)
            WriteResult(idd, cmpclass, seq, numTM, kr_bias, KR_pos_list,
                        fpout_krlist)
            if cmpclass in ["IDT", "INV"]:
                fpout_krbias.write("%d\n" % kr_bias)
Ejemplo n.º 24
0
def main():#{{{
    """Command-line entry point: pair topologies with amino acid sequences.

    Options are read from ``sys.argv``:

      -i, --infile FILE     topology file; may also be given as a positional
                            argument or as the argument following a bare "--"
      -f, --fasta FILE      amino acid fasta file
      -printid y|n          set the module-level flag ``isPrintSeqID``
      -o, --outfile FILE    output file (default: stdout)
      -h, --help            print the help text

    The topology file is read in chunks of BLOCK_SIZE; the complete records
    of each chunk are passed to Topo2TMFrag together with a dictionary that
    maps sequence IDs to amino acid sequences.

    Returns 1 on usage or input errors and -1 when the topology file cannot
    be opened. On success the function falls through and returns None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    global isPrintSeqID
    outFile = ""
    inFile = ""
    fastaFile = ""

    # Options that must be followed by a value.
    valueOptions = ("-i", "--infile", "-f", "--fasta", "-printid",
            "--printid", "-o", "--outfile")

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            # The argument following "--" is the topology file, even when it
            # starts with a dash.
            inFile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            elif arg in valueOptions:
                if i + 1 >= numArgv:
                    # Avoid an IndexError when the value is missing.
                    sys.stderr.write(
                            "Error! Option %s requires a value.\n" % arg)
                    return 1
                value = argv[i + 1]
                if arg in ("-i", "--infile"):
                    inFile = value
                elif arg in ("-f", "--fasta"):
                    fastaFile = value
                elif arg in ("-printid", "--printid"):
                    isPrintSeqID = value.lower().startswith("y")
                else:
                    outFile = value
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument:%s\n" % arg)
                return 1
        else:
            inFile = arg
            i += 1

    if inFile == "":
        sys.stderr.write("Error! Topology file not set.\n")
        return 1
    if fastaFile == "":
        sys.stderr.write("Error!  amino acid fasta file not set.\n")
        return 1

    fpout = sys.stdout
    if outFile != "":
        # open() raises on failure instead of returning a false value, so the
        # fallback to stdout has to live in an exception handler.
        try:
            fpout = open(outFile, "w")
        except IOError:
            sys.stderr.write("Failed to write to outfile %s. \n" % (outFile))
            sys.stderr.write("Reset output to stdout.\n")
            fpout = sys.stdout

    try:
        sizeAASeqFile = os.path.getsize(fastaFile)
        if sizeAASeqFile > MAX_FASTA_AA_FILE_SIZE:
            sys.stderr.write("size (%d)" % sizeAASeqFile
                    + " of fasta sequence file (%s)" % fastaFile
                    + " is over the limit (%d). Exit.\n"
                    % MAX_FASTA_AA_FILE_SIZE)
            return 1

        (idListSeq, annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile)
        if idListSeq is None:
            sys.stderr.write("%s exit with error.\n" % sys.argv[0])
            return 1
        elif len(idListSeq) < 1:
            sys.stderr.write("Warning! zero aa sequences have"
                    + " been read in for file %s\n" % fastaFile)
        aaSeqDict = dict(zip(idListSeq, seqList))

        try:
            fpin = open(inFile, "rb")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" % (inFile))
            return -1

        try:
            unprocessedBuffer = ""
            isEOFreached = False
            processedTopoIDSet = set()
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means the end of the file has been reached.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                # Prepend whatever the previous round could not parse yet.
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                        buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    idListTopo = [r[0] for r in recordList]
                    topoList = [r[2] for r in recordList]
                    Topo2TMFrag(idListTopo, topoList, aaSeqDict,
                            processedTopoIDSet, fpout)
        finally:
            fpin.close()
    finally:
        # Close the output on every exit path, but never close stdout.
        if fpout is not None and fpout is not sys.stdout:
            fpout.close()
Ejemplo n.º 25
0
def MakeTMplot(seqAlnFile, topAlnFile, outpath, tmpdir):# {{{
    """Make topology plot for TM family.

    Both alignment files are copied into ``tmpdir`` and all external tools
    are run with ``tmpdir`` as the working directory. The original working
    directory is restored on every exit path, including failures.

    Parameters:
        seqAlnFile  path of the sequence alignment file
        topAlnFile  path of the topology alignment file
        outpath     folder receiving "<rootname>.seqtopaln.pdf"; it is used
                    while the working directory is ``tmpdir``, so a relative
                    path is resolved against ``tmpdir``
        tmpdir      existing scratch folder

    Returns 0 when all steps ran, 1 as soon as one step fails or the
    topology figure is missing.
    """
    rootname = os.path.basename(os.path.splitext(seqAlnFile)[0])
    basename_seqAlnFile = os.path.basename(seqAlnFile)
    basename_topAlnFile = os.path.basename(topAlnFile)
    ext_topAlnFile = os.path.splitext(topAlnFile)[1].lstrip('.')

    shutil.copy2(seqAlnFile, os.path.join(tmpdir, basename_seqAlnFile))
    shutil.copy2(topAlnFile, os.path.join(tmpdir, basename_topAlnFile))
    cwd = os.getcwd()

    os.chdir(tmpdir)
    try:
        return _MakeTMplotInCwd(rootname, basename_seqAlnFile,
                basename_topAlnFile, ext_topAlnFile, outpath)
    finally:
        # Without this, an early "return 1" would leave the whole process
        # sitting in tmpdir.
        os.chdir(cwd)


def _RunTMplotStep(cmd, verbose_msg=None):
    """Run one external command via myfunc.RunCmd; return True on success.

    ``verbose_msg`` is printed first when g_params['verbose'] is set. On
    failure the message returned by RunCmd is printed.
    """
    if verbose_msg is not None and g_params['verbose']:
        print(verbose_msg)
    (isCmdSuccess, t_runtime, t_msg) = myfunc.RunCmd(cmd)
    if not isCmdSuccess:
        print(t_msg)
    return isCmdSuccess


def _MakeTMplotInCwd(rootname, basename_seqAlnFile, basename_topAlnFile,
        ext_topAlnFile, outpath):
    """Run the plotting pipeline of MakeTMplot in the current directory.

    Expects both alignment files to be present in the current working
    directory. Returns 0 or 1 exactly like MakeTMplot.
    """
    # generate topology one line plot
    cmd = [python_exec, os.path.join(rundir, "drawMSATopo.py"), "-m-shrink",
        str(0), "-method", "pil",  "-pfm", "no", "-text", "n",  "-pdg", "n",
        "-pfm", "n",  "-pmsa", "y", "-ptag", "y", "-showTMidx", "-sep", "n",
        "--advtopo",   "-cleanplot", "-h2wratio", str(g_params["H2W_ratio"]),
        "-shrink", "no", "-showgap", basename_topAlnFile]
    if not _RunTMplotStep(cmd,
            "Generating toplogy alignment figure for %s"%(rootname)):
        return 1
    topalnfigure = "%s.png"%(rootname)
    if not os.path.exists(topalnfigure):
        return 1

    # resize a copy of the figure file, the original figure is kept
    resized_topalnfigure = "%s.s%d.png"%(rootname, g_params['figure_resize'])
    shutil.copy2(topalnfigure, resized_topalnfigure)
    cmd = ["mogrify", "-resize", str(g_params['figure_resize']),
            resized_topalnfigure]
    if not _RunTMplotStep(cmd,
            "Resizing the topology alignment figure for %s"%(rootname)):
        return 1

    # generate seqaln figure
    seqaln_htmlfigure = "%s.%s"%(rootname, "seqaln.html")
    cmd = [python_exec, os.path.join(rundir, "write_seqaln_colorTM.py"),
            basename_seqAlnFile, "-ext-topomsa", ext_topAlnFile, "-ws",
            str(g_params['window_size']), "-o",
            seqaln_htmlfigure, "-cleanplot", "-rmgap"]
    if g_params['isBreakTM']:
        cmd += ["-breakTM"]
    if not _RunTMplotStep(cmd,
            "Generating sequence alignment highlighted by TM regions for %s"%(
                rootname)):
        return 1

    # convert html to pdf
    seqaln_pdffigure = "%s.%s"%(rootname, "seqaln.pdf")
    cmd = ["wkhtmltopdf",  seqaln_htmlfigure, seqaln_pdffigure]
    if os_dist.lower() in ["debian", "ubuntu"]:
        cmd = ["xvfb-run"] + cmd
    if not _RunTMplotStep(cmd,
            "Convert the html figure to PDF for sequence alignment"):
        return 1

    # crop the PDF figure
    if not _RunTMplotStep(["pdfcrop", seqaln_pdffigure]):
        return 1
    seqaln_pdffigure_crop =  "%s.%s"%(rootname, "seqaln-crop.pdf")

    # merge figures, adding one caption per sequence
    (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(basename_seqAlnFile)
    outfile = "%s.seqtopaln.pdf"%(rootname)
    cmd = ["bash", os.path.join(rundir, "merge_tmplot.sh"),
            resized_topalnfigure, seqaln_pdffigure_crop, "-cap",
            "%s"%(rootname), "-o", outfile]
    for i, seqid in enumerate(seqIDList):
        # NOTE(review): alphabet[i] raises IndexError when there are more
        # sequences than entries in alphabet -- confirm the expected maximum.
        cmd += ["-cap", "%s: %s"%(alphabet[i], seqid)]
    if not _RunTMplotStep(cmd,
            "Merging the topology alignment figure and sequence alignment figure for %s"%(
                rootname)):
        return 1

    # copy the pdf figure generated by latex to a tmp file (a hack for the
    # PDFcrop
    tmpoutfile = "tt1.pdf"
    shutil.copy2(outfile, tmpoutfile)

    # crop the merged PDF figure
    if not _RunTMplotStep(["pdfcrop", tmpoutfile]):
        return 1
    outfile_crop =  "tt1-crop.pdf"

    if os.path.exists(outfile_crop):
        final_targetfile = os.path.join(outpath, "%s.seqtopaln.pdf"%(rootname))
        shutil.copy2(outfile_crop, final_targetfile)
        # Only announce the copy when it really happened.
        if g_params['verbose']:
            print("Copy the result to final target %s"%(final_targetfile))

    return 0
Ejemplo n.º 26
0
        sys.exit(1)
    if topoWithDGScoreFile == "" and dgscanFile == "":
        print >> sys.stderr, "Error! Either topoWithDGScoreFile or dgscanFile should be set."
        sys.exit(1)
    if topoWithDGScoreFile != "" and dgscanFile != "":
        print >> sys.stderr, "Error! Only one of the topoWithDGScoreFile and dgscanFile can be set."
        sys.exit(1)

    fpout = sys.stdout
    if outFile != "":
        fpout = open(outFile, "w")

    try:
        gapopenList = []
        topoWithDGScoreList = []
        (idListSeq, annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile)
        if topoWithDGScoreFile != "":
            (topoWithDGScoreList,
             indexID) = ReadTopoWithDGScore(topoWithDGScoreFile)
            gapopenList = GetGapOpenValues(topoWithDGScoreList)
            if not (len(gapopenList) == len(idListSeq)
                    and len(gapopenList) == len(topoWithDGScoreList)):
                print >> sys.stderr, "length mismatch"
                print >> sys.stderr, "len(gapopenList)=", len(gapopenList)
                print >> sys.stderr, "len(idListSeq)=", len(idListSeq)
                print >> sys.stderr, "len(topoWithDGScoreList)=", len(
                    topoWithDGScoreList)
                sys.exit(1)
        elif dgscanFile != "":
            (dgscanList, indexID) = ReadDGScan(dgscanFile)
            gapopenList = GetGapOpenValuesFromDGScan(dgscanList)
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run the prediction workflow for every sequence of a fasta file.

    Visible steps:
      1. Create the result folders <tmpdir>/<jobid> and <outpath>/<jobid>.
      2. Read ``infile`` record by record. Unless g_params['isForceRun'] is
         set, a sequence whose md5 key exists under ``path_md5cache`` is
         linked to the cached result and recorded as "cached"; all other
         sequences are collected in ``toRunDict``.
      3. Run ``script_scampi`` on the collected sequences to estimate the
         number of TM helices and sort the sequences by that number in
         descending order.
      4. Run ``runscript`` one sequence at a time, move the result folder to
         <outpath>/<jobid>/seq_<index>, append a record to
         finished_seqs.txt, write the per-sequence text result and, on the
         front-end only, update the md5 cache.

    Messages are appended to g_params['runjob_log'] and
    g_params['runjob_err'].

    NOTE(review): several locals (all_begin_time, rootname, runjob_logfile,
    finishtagfile, tarball, zipfile, tarball_fullpath, zipfile_fullpath,
    outfile, resultfile_text, maplist, origpath, rt_msg) as well as the
    parameter ``email`` are assigned but not used in the visible body, and no
    return statement is visible.
    """
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    # all results of this job live in a folder named after the job ID
    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0))
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)

    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    # NOTE(review): isOK is overwritten below, so a failure to create the tmp
    # folder above is forgotten when the second makedirs succeeds.
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass

    if isOK:
        # create (or truncate) the list of finished sequences
        try:
            open(finished_seq_file, 'w').close()
        except:
            pass
#first getting result from caches
# ==================================

        maplist = []
        maplist_simple = []
        # origIndex -> [seq, numTM, description] for sequences without a
        # cached result
        toRunDict = {}
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)

            recordList = hdl.readseq()
            # cnt is the running index of the sequence within infile
            cnt = 0
            origpath = os.getcwd()
            while recordList != None:
                for rd in recordList:
                    isSkip = False
                    # temp outpath for the sequence is always seq_0, and I feed
                    # only one seq a time to the workflow
                    tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                      "seq_%d" % 0)
                    outpath_this_seq = "%s/%s" % (outpath_result,
                                                  "seq_%d" % cnt)
                    subfoldername_this_seq = "seq_%d" % (cnt)
                    if os.path.exists(tmp_outpath_this_seq):
                        try:
                            shutil.rmtree(tmp_outpath_this_seq)
                        except OSError:
                            pass

                    maplist.append(
                        "%s\t%d\t%s\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append(
                        "%s\t%d\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description))
                    if not g_params['isForceRun']:
                        # cache layout: <path_md5cache>/<first two hex
                        # characters of the md5 key>/<md5 key>
                        md5_key = hashlib.md5(rd.seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        if os.path.exists(md5_link):
                            # create a symlink to the cache
                            rela_path = os.path.relpath(
                                md5_link, outpath_result)  #relative path
                            # NOTE(review): the working directory is changed
                            # here and not restored in the visible code
                            # (origpath is never used).
                            os.chdir(outpath_result)
                            os.symlink(rela_path, subfoldername_this_seq)

                            if os.path.exists(outpath_this_seq):
                                # the link resolves: record the cached result
                                # as finished and skip the run
                                runtime = 0.0  #in seconds
                                topfile = "%s/%s/topcons.top" % (
                                    outpath_this_seq, "Topcons")
                                top = myfunc.ReadFile(topfile).strip()
                                numTM = myfunc.CountTM(top)
                                posSP = myfunc.GetSPPosition(top)
                                if len(posSP) > 0:
                                    isHasSP = True
                                else:
                                    isHasSP = False
                                info_finish = [
                                    "seq_%d" % cnt,
                                    str(len(rd.seq)),
                                    str(numTM),
                                    str(isHasSP), "cached",
                                    str(runtime), rd.description
                                ]
                                myfunc.WriteFile("\t".join(info_finish) + "\n",
                                                 finished_seq_file,
                                                 "a",
                                                 isFlush=True)
                                isSkip = True

                    if not isSkip:
                        # first try to delete the outfolder if exists
                        if os.path.exists(outpath_this_seq):
                            try:
                                shutil.rmtree(outpath_this_seq)
                            except OSError:
                                pass
                        origIndex = cnt
                        numTM = 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description
                                                ]  #init value for numTM is 0

                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        # NOTE(review): this write also runs when reading infile failed, in
        # which case maplist_simple is empty.
        myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

        # run scampi single to estimate the number of TM helices and then run
        # the query sequences in the descending order of numTM
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            # element 0 of the entry is the amino acid sequence
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        topfile_scampiseq = "%s/%s" % (tmp_outpath_result,
                                       "query.torun.fa.topo")
        if os.path.exists(torun_all_seqfile):
            # run scampi to estimate the number of TM helices
            cmd = [
                script_scampi, torun_all_seqfile, "-outpath",
                tmp_outpath_result
            ]
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                pass
        if os.path.exists(topfile_scampiseq):
            (idlist_scampi, annolist_scampi,
             toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq)
            for jj in xrange(len(idlist_scampi)):
                numTM = myfunc.CountTM(toplist_scampi[jj])
                # the record IDs are the original indices written above;
                # entries that cannot be mapped back are ignored
                try:
                    toRunDict[int(idlist_scampi[jj])][1] = numTM
                except (KeyError, ValueError, TypeError):
                    pass

        # sort by the estimated number of TM helices, largest first
        sortedlist = sorted(toRunDict.items(),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]

        # submit sequences one by one to the workflow according to orders in
        # sortedlist

        for item in sortedlist:
            #             g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex)
            # the temporary folder is named seq_0 for every sequence, so it
            # is removed before each run
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" %
                                              (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" %
                                          (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append(
                    "failed to generate seq index %d" % (origIndex))
                continue

            cmd = [
                runscript, seqfile_this_seq, tmp_outpath_result, blastdir,
                blastdb
            ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
                g_params['runjob_log'].append("workflow:\n" + rmsg + "\n")
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                # NOTE(review): rmsg still holds the output of an earlier
                # successful call here, not the output of the failed command.
                g_params['runjob_err'].append(rmsg + "\n")
                pass
                #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
                #if len(suqoutfilelist)>0:
                #    suqoutfile = suqoutfilelist[0]
                #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time

            if os.path.exists(tmp_outpath_this_seq):
                # move the result from the temporary folder to its final
                # place
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                isCmdSuccess = False
                try:
                    subprocess.check_output(cmd)
                    isCmdSuccess = True
                except subprocess.CalledProcessError, e:
                    msg = "Failed to run prediction for sequence No. %d\n" % (
                        origIndex)
                    g_params['runjob_err'].append(msg)
                    g_params['runjob_err'].append(str(e) + "\n")
                    pass
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists(
                        outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        g_params['runjob_err'].append(
                            "Failed to move %s/time.txt" %
                            (tmp_outpath_result) + "\n")
                        pass

                if isCmdSuccess:
                    # record the finished sequence in finished_seqs.txt
                    runtime = runtime_in_sec  #in seconds
                    topfile = "%s/%s/topcons.top" % (outpath_this_seq,
                                                     "Topcons")
                    top = myfunc.ReadFile(topfile).strip()
                    numTM = myfunc.CountTM(top)
                    posSP = myfunc.GetSPPosition(top)
                    if len(posSP) > 0:
                        isHasSP = True
                    else:
                        isHasSP = False
                    info_finish = [
                        "seq_%d" % origIndex,
                        str(len(seq)),
                        str(numTM),
                        str(isHasSP), "newrun",
                        str(runtime), description
                    ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)
                    # now write the text output for this seq

                    info_this_seq = "%s\t%d\t%s\t%s" % (
                        "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq,
                                                      outpath_result,
                                                      [info_this_seq],
                                                      runtime_in_sec,
                                                      g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if g_params['base_www_url'].find("topcons.net") != -1:
                        md5_key = hashlib.md5(seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        # replace an existing cache entry with the new result
                        if os.path.exists(md5_link):
                            try:
                                os.unlink(md5_link)
                            except:
                                pass
                        # same path as md5_subfolder
                        subfolder_md5 = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        if not os.path.exists(subfolder_md5):
                            try:
                                os.makedirs(subfolder_md5)
                            except:
                                pass

                        rela_path = os.path.relpath(
                            outpath_this_seq, md5_subfolder)  #relative path
                        # the link target is relative, so the link is created
                        # from inside the cache subfolder; failures are
                        # silently ignored
                        try:
                            os.chdir(md5_subfolder)
                            os.symlink(rela_path, md5_key)
                        except:
                            pass
Ejemplo n.º 28
0
def main(g_params):
    """Write the records of an MSA file in the order given by an ID list.

    Options are read from ``sys.argv``:

      -orderlist FILE       file with the sequence IDs in the wanted order
      -msafile FILE         fasta file; may also be given as a positional
                            argument or as the argument following a bare "--"
      -o FILE               output file (passed to myfunc.myopen, which is
                            given sys.stdout as the default)
      -of, -outformat FMT   "fasta" (default) or "anno"; with "anno" only the
                            annotation lines are written
      -h, --help            print the help text

    IDs of the order list that are missing from the MSA file are reported on
    stderr and skipped.

    Returns 1 on usage errors, otherwise 0.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outFile = ""
    orderlistfile = ""
    msafile = ""
    outformat = "fasta" # fasta or anno

    # Options that must be followed by a value.
    valueOptions = ("-o", "--o", "-orderlist", "--orderlist", "-msafile",
            "--msafile", "-of", "--of", "-outformat", "--outformat")

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            msafile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            elif arg in valueOptions:
                if i + 1 >= numArgv:
                    # Avoid an IndexError when the value is missing.
                    print("Error! Option %s requires a value." % arg,
                            file=sys.stderr)
                    return 1
                value = argv[i + 1]
                if arg in ("-o", "--o"):
                    outFile = value
                elif arg in ("-orderlist", "--orderlist"):
                    orderlistfile = value
                elif arg in ("-msafile", "--msafile"):
                    msafile = value
                else:
                    outformat = value.lower()
                i += 2
            else:
                print(("Error! Wrong argument:%s" % arg), file=sys.stderr)
                return 1
        else:
            msafile = arg
            i += 1

    if outformat not in ["anno", "fasta"]:
        print("Unrecognized outformat \"%s\","%(
                outformat) + " should be either \"anno\" or \"fasta\".", file=sys.stderr)
        return 1

    if orderlistfile == "":
        print("orderlist file not set. Exit", file=sys.stderr)
        return 1
    if msafile == "":
        print("msafile not set. Exit", file=sys.stderr)
        # The message says "Exit", so stop instead of reading an empty path.
        return 1

    orderList = ReadOrderList(orderlistfile)
    (idList, annoList, seqList) = myfunc.ReadFasta(msafile)

    if len(orderList) > 0 and len(idList) > 0:
        isWriteSeq = (outformat != "anno")
        annoDict = dict(zip(idList, annoList))
        seqDict = dict(zip(idList, seqList)) if isWriteSeq else {}

        fpout = myfunc.myopen(outFile, sys.stdout, "w", False)
        try:
            for sid in orderList:
                if sid not in annoDict:
                    print("seqid %s not in msafile %s"%(
                            sid, msafile), file=sys.stderr)
                    continue
                fpout.write(">%s\n"%annoDict[sid])
                if isWriteSeq:
                    fpout.write("%s\n"%seqDict[sid])
        finally:
            # Close the output even when writing fails halfway.
            myfunc.myclose(fpout)

    return 0
Ejemplo n.º 29
0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG,
                           isWriteRel):  #{{{
    """Dump the predictions of all sequences of a job into a text report.

    For the sequence at index i of ``seqfile`` the results are read from
    <path_result>/seq_<i>. Two files are written:

      outfile            the report with one section per sequence: sequence
                         information, the topology of every method and, on
                         request, the Delta-G values and the reliability
      outfile + ".fa"    the TOPCONS topology of every sequence in fasta
                         format

    Parameters:
        seqfile      fasta file with the query sequences
        path_result  folder holding the seq_<i> result folders
        outfile      path of the text report
        isWriteDG    when true, append the lines of dg.txt that start with a
                     digit
        isWriteRel   when true, append Topcons/reliability.txt

    Returns 1 when one of the output files cannot be opened, otherwise 0.
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)

    try:
        fpout = open(outfile, "w")
    except IOError:
        print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile)
        return 1

    try:
        fpout_fa = open(outfile_fa, "w")
    except IOError:
        # Do not leak the report handle that is already open.
        fpout.close()
        print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile_fa)
        return 1

    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS',
        'Homology'
    ]
    # Topology files whose location differs from "<method>/query.top".
    special_topfile = {
        'TOPCONS': "Topcons/topcons.top",
        'Philius': "philius/query.top",
        'SCAMPI': "SCAMPI_MSA/query.top",
    }

    try:
        for idx, (seq, desp) in enumerate(zip(seqlist, seqannolist)):
            subdir = "%s/%s" % (path_result, "seq_%d" % (idx))
            print >> fpout, "Sequence number: %d" % (idx + 1)
            print >> fpout, "Sequence name: %s" % (desp)
            print >> fpout, "Sequence length: %d aa." % (len(seq))
            print >> fpout, "Sequence:\n%s\n\n" % (seq)

            topo_consensus = ""
            for method in methodlist:
                topfile = "%s/%s" % (subdir, special_topfile.get(
                    method, "%s/query.top" % (method)))
                seqid = ""
                top = ""
                if os.path.exists(topfile):
                    (seqid, seqanno, top) = myfunc.ReadSingleFasta(topfile)
                if top == "":
                    top = "***No topology could be produced with this method***"

                if method == "TOPCONS":
                    topo_consensus = top

                if method == "Homology":
                    # The ID stored in the topology file, when there is one,
                    # replaces the method name as heading.
                    showtext_homo = method
                    if seqid != "":
                        showtext_homo = seqid
                    print >> fpout, "%s:\n%s\n\n" % (showtext_homo, top)
                else:
                    print >> fpout, "%s predicted topology:\n%s\n\n" % (method,
                                                                        top)

            if isWriteDG:
                dgfile = "%s/dg.txt" % (subdir)
                dg_content = ""
                if os.path.exists(dgfile):
                    dg_content = myfunc.ReadFile(dgfile)
                # Keep only the lines starting with a digit.
                dglines = [
                    line for line in dg_content.split("\n")
                    if line and line[0].isdigit()
                ]
                if len(dglines) > 0:
                    print >> fpout,  "\nPredicted Delta-G-values (kcal/mol) "\
                            "(left column=sequence position; right column=Delta-G)\n"
                    print >> fpout, "\n".join(dglines)

            if isWriteRel:
                reliability_file = "%s/Topcons/reliability.txt" % (subdir)
                reliability = ""
                if os.path.exists(reliability_file):
                    reliability = myfunc.ReadFile(reliability_file)
                if reliability != "":
                    print >> fpout, "\nPredicted TOPCONS reliability (left "\
                            "column=sequence position; right column=reliability)\n"
                    print >> fpout, reliability

            print >> fpout, "##############################################################################"

            # write the concensus prediction in FASTA format
            print >> fpout_fa, ">%s" % (desp)
            print >> fpout_fa, topo_consensus
    finally:
        # Close both files on every exit path; close errors are ignored as
        # before.
        for fp in (fpout, fpout_fa):
            try:
                fp.close()
            except IOError:
                pass

    return 0