コード例 #1
0
def SelectPairaln(pairalnFile, pairlistSet, fpout):  #{{{
    """Scan a pairwise-alignment file and pass its records to SelectRecord.

    The file is read in chunks of BLOCK_SIZE characters.  Each chunk is
    parsed with myfunc.ReadFastaFromBuffer and the parsed records are handed
    to SelectRecord together with pairlistSet and fpout.  Scanning stops at
    end of file, or earlier once the counts returned by SelectRecord add up
    to len(pairlistSet).

    Args:
        pairalnFile: path of the file to read.
        pairlistSet: collection forwarded to SelectRecord; its size is the
            number of pairs to select.
        fpout: output handle forwarded to SelectRecord.

    Returns:
        1 when an IOError is raised while opening or processing the file,
        otherwise None.
    """
    try:
        numPairToSelect = len(pairlistSet)
        cntSelectedPair = 0
        unprocessedBuffer = ""
        isEOFreached = False
        remainedRd = None
        # "with" releases the handle even if parsing or SelectRecord raises;
        # a bare open()/close() pair would leak it on that path.
        with open(pairalnFile, "r") as fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # Re-queue the odd record left over from the previous chunk
                # (presumably so that records keep being handled in pairs).
                if remainedRd is not None:
                    recordList.append(remainedRd)
                # The return value is the tail of the buffer that could not
                # be parsed yet; it is prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                numRecord = len(recordList)
                if numRecord > 0:
                    cntSelectedPair += SelectRecord(recordList, pairlistSet,
                                                    fpout)
                    if cntSelectedPair >= numPairToSelect:
                        break
                    if numRecord % 2 == 1:
                        remainedRd = recordList[-1]
                    else:
                        remainedRd = None
    except IOError:
        print >> sys.stderr, "Failed to open file %s for read" % (pairalnFile)
        return 1
コード例 #2
0
def FilterPairalnByKeyword(infile, seqid2AnnoDict, fpout):  #{{{
    """Scan a pairwise-alignment file and pass its records to FilterRecord.

    The file is read in chunks of BLOCK_SIZE characters, parsed with
    myfunc.ReadFastaFromBuffer, and every non-empty batch of records is
    handed to FilterRecord together with seqid2AnnoDict and fpout.

    Args:
        infile: path of the file to read.
        seqid2AnnoDict: mapping forwarded unchanged to FilterRecord.
        fpout: output handle forwarded unchanged to FilterRecord.

    Returns:
        1 when an IOError is raised while opening or processing the file,
        otherwise None.
    """
    try:
        unprocessedBuffer = ""
        isEOFreached = False
        remainedRd = None
        # "with" releases the handle even if parsing or FilterRecord raises;
        # a bare open()/close() pair would leak it on that path.
        with open(infile, "r") as fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # Re-queue the odd record left over from the previous chunk
                # (presumably so that records keep being handled in pairs).
                if remainedRd is not None:
                    recordList.append(remainedRd)
                # The return value is the tail of the buffer that could not
                # be parsed yet; it is prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                numRecord = len(recordList)
                if numRecord > 0:
                    FilterRecord(recordList, seqid2AnnoDict, fpout)
                    if numRecord % 2 == 1:
                        remainedRd = recordList[-1]
                    else:
                        remainedRd = None
    except IOError:
        print >> sys.stderr, "Failed to open file %s for read" % (infile)
        return 1
コード例 #3
0
def MSA2Seq_fasta(infile, outfile):  #{{{
    """Write the records of an alignment file as gap-free FASTA.

    The input is read in chunks of BLOCK_SIZE and parsed with
    myfunc.ReadFastaFromBuffer.  For every record, field 1 is written as the
    header line and field 2 is written as the sequence after removing the
    characters "-", "." and " ".

    Args:
        infile: path of the alignment file to read.
        outfile: output path handed to myfunc.myopen, with sys.stdout as the
            default handle.

    Returns:
        1 when an IOError is raised, otherwise None.
    """
    try:
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        try:
            unprocessedBuffer = ""
            isEOFreached = False
            # "with" plus the enclosing "finally" release both handles on
            # the error path as well as on success.
            with open(infile, "rb") as fpin:
                while not isEOFreached:
                    buff = fpin.read(BLOCK_SIZE)
                    # A short read means there is nothing left in the file.
                    if len(buff) < BLOCK_SIZE:
                        isEOFreached = True
                    recordList = []
                    # The return value is the unparsed tail of the buffer;
                    # it is prepended to the next chunk.
                    unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                        unprocessedBuffer + buff, recordList, isEOFreached)
                    for rd in recordList:
                        anno = rd[1]
                        seq = rd[2].replace("-", "").replace(".", "").replace(
                            " ", "")
                        fpout.write(">%s\n" % anno)
                        fpout.write("%s\n" % seq)
        finally:
            myfunc.myclose(fpout)
    except IOError:
        print >> sys.stderr, "Failed to read file", infile
        return 1
コード例 #4
0
ファイル: test.py プロジェクト: ElofssonLab/web_proq3
# Directory containing this script, and the real path of its parent
# directory.
rundir = os.path.dirname(__file__)
basedir = os.path.realpath("%s/../" % (rundir))

# Every section below is guarded by "if 0:" and therefore never runs; change
# a guard to "if 1:" to exercise that section by hand.

# Scratch section: call myfunc.ReadIDList2 on the file named by the first
# command-line argument and print the result.
if 0:  #{{{
    infile = sys.argv[1]
    li = myfunc.ReadIDList2(infile, 2, None)
    print(li)
#}}}
# Scratch section: parse a hard-coded buffer with myfunc.ReadFastaFromBuffer
# and print the parsed records.  Only the last rawseq assignment takes
# effect; the earlier and commented-out values are alternative inputs.
if 0:  #{{{
    rawseq = ">1\nseqAAAAAAAAAAAAAAAAAAAAAAAAA\n    \n>2  dad\ndfasdf  "
    #rawseq = "  >1\nskdfaskldgasdk\nf\ndadfa\n\n\nadsad   \n"
    #rawseq = ">sadjfasdkjfsalkdfsadfjasdfk"
    rawseq = "asdkfjasdg asdkfasdf\n"
    seqRecordList = []
    myfunc.ReadFastaFromBuffer(rawseq, seqRecordList, True, 0, 0)

    print(seqRecordList)
#}}}

# Scratch section: print the first command-line argument as a float and the
# value myfunc.Size_byte2human returns for it.
if 0:  #{{{
    size = float(sys.argv[1])
    print("size=", size)
    print("humansize=", myfunc.Size_byte2human(size))  #}}}

# Scratch section: read static/doc/news.txt under basedir with
# myfunc.ReadNews and print the result.
if 0:  # {{{
    newsfile = "%s/static/doc/news.txt" % (basedir)
    newsList = myfunc.ReadNews(newsfile)
    print(newsList)
# }}}
コード例 #5
0
def main(g_params):#{{{
    """Rewrite the header lines of a FASTA-like file, optionally via an ID map.

    Command-line arguments are taken from sys.argv:
      -i/--i/--infile FILE     input file (also accepted as a bare argument
                               or as the argument following "--")
      -o/--o/-outfile/--outfile FILE   output file (default: stdout)
      -map/--map FILE          map file read with ReadMapFile
      -h/--help                print help and exit

    For every record the header is written as ">ID ANNOTATION", where ID is
    record field 0, or its mapped value when a map file was supplied.  When
    an ID is missing from the map a message is printed to stderr and the
    original annotation line is written unchanged.

    Args:
        g_params: global parameter dictionary; not read by this function.

    Returns:
        1 on a usage error or IOError, otherwise None.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    mapfile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # The word right after "--" is the input file.  Reading
            # sys.argv[i+1] here would skip it and raise IndexError when it
            # is the last argument.
            infile = sys.argv[i]
            isNonOptionArg = False
            i = i + 1
        elif sys.argv[i] == "--":
            isNonOptionArg = True
            i = i + 1
        elif sys.argv[i].startswith("-"):
            if sys.argv[i] == "-h" or sys.argv[i] == "--help":
                PrintHelp()
                return 1
            elif sys.argv[i] in ["-i", "--i", "--infile"]:
                infile = sys.argv[i+1]
                i = i + 2
            elif sys.argv[i] in ["-o", "--o", "-outfile", "--outfile"]:
                outfile = sys.argv[i+1]
                i = i + 2
            elif sys.argv[i] in ["-map", "--map"]:
                mapfile = sys.argv[i+1]
                i = i + 2
            else:
                print >> sys.stderr,("Error! Wrong argument:%s" % sys.argv[i])
                return 1
        else:
            infile = sys.argv[i]
            i += 1

    if infile == "":
        print >> sys.stderr,"Error! Topology file not set."
        return 1

    isMapSupplied = False
    mapDict = {}
    if mapfile != "" and os.path.exists(mapfile):
        mapDict = ReadMapFile(mapfile)
        isMapSupplied = True

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    try:
        unprocessedBuffer = ""
        isEOFreached = False
        with open(infile, "rb") as fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The return value is the unparsed tail of the buffer; it is
                # prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for rd in recordList:
                    # Without a map, use the seqid as the first word.
                    # With a map, use the mapped id; on KeyError keep the
                    # original annotation line.
                    if not isMapSupplied:
                        fpout.write(">%s %s\n"%(rd[0], rd[1]))
                    else:
                        try:
                            renamedID = mapDict[rd[0]]
                            fpout.write(">%s %s\n"%(renamedID, rd[1]))
                        except KeyError:
                            msg = "ID %s not found in mapDict"
                            print >> sys.stderr, msg%(rd[0])
                            fpout.write(">%s\n"%(rd[1]))
                    fpout.write("%s\n"%(rd[2]))
    except IOError:
        print >> sys.stderr, "Failed to read input file %s"%(infile)
        return 1
    finally:
        # Close the output on the error path too.
        myfunc.myclose(fpout)
コード例 #6
0
def main():#{{{
    """Read a topology file and pass its records to Topo2TMFrag.

    Command-line arguments are taken from sys.argv:
      -i/--infile FILE     topology file (also accepted as a bare argument
                           or as the argument following "--")
      -f/--fasta FILE      amino acid FASTA file (required)
      -printid/--printid y|n   sets the global isPrintSeqID
      -o/--outfile FILE    output file (default: stdout)
      -h/--help            print help and exit

    The FASTA file is loaded completely with myfunc.ReadFasta into an
    id -> sequence dictionary; the topology file is then read in chunks of
    BLOCK_SIZE and each batch of parsed records is handed to Topo2TMFrag.

    Returns:
        1 on a usage error or when the FASTA file is too large or unreadable,
        -1 when the topology file cannot be opened, otherwise None.
    """
    global isPrintSeqID

    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outFile = ""
    inFile = ""
    fastaFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # The word after "--" is the input file; without this
            # assignment it would be skipped and lost.
            inFile = sys.argv[i]
            isNonOptionArg = False
            i = i + 1
        elif sys.argv[i] == "--":
            isNonOptionArg = True
            i = i + 1
        elif sys.argv[i].startswith("-"):
            if sys.argv[i] == "-h" or sys.argv[i] == "--help":
                PrintHelp()
                return 1
            elif sys.argv[i] == "-i" or sys.argv[i] == "--infile":
                inFile = sys.argv[i+1]
                i = i + 2
            elif sys.argv[i] == "-f" or sys.argv[i] == "--fasta":
                fastaFile = sys.argv[i+1]
                i = i + 2
            elif sys.argv[i] == "-printid" or sys.argv[i] == "--printid":
                isPrintSeqID = sys.argv[i+1].lower().startswith("y")
                i = i + 2
            elif sys.argv[i] == "-o" or sys.argv[i] == "--outfile":
                outFile = sys.argv[i+1]
                i = i + 2
            else:
                print >> sys.stderr,("Error! Wrong argument:%s" % sys.argv[i])
                return 1
        else:
            inFile = sys.argv[i]
            i += 1

    if inFile == "":
        print >> sys.stderr,"Error! Topology file not set."
        return 1
    if fastaFile == "":
        print >> sys.stderr,"Error!  amino acid fasta file not set."
        return 1

    fpout = sys.stdout
    if outFile != "":
        # open() raises instead of returning a false value, so the fallback
        # to stdout has to live in an exception handler.
        try:
            fpout = open(outFile, "w")
        except IOError:
            print >> sys.stderr, "Failed to write to outfile %s. "%(outFile)
            print >> sys.stderr, "Reset output to stdout."
            fpout = sys.stdout

    try:
        sizeAASeqFile = os.path.getsize(fastaFile)
        if sizeAASeqFile > MAX_FASTA_AA_FILE_SIZE:
            print >> sys.stderr, ("size (%d)"%sizeAASeqFile
                    + " of fasta sequence file (%s)"%fastaFile
                    + " is over the limit (%d). Exit."% MAX_FASTA_AA_FILE_SIZE)
            return 1

        (idListSeq, annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile)
        if idListSeq is None:
            print >> sys.stderr, "%s exit with error."%sys.argv[0]
            return 1
        elif len(idListSeq) < 1:
            # Compare the length: comparing the list itself with 1 never
            # triggers this warning.
            print >> sys.stderr, ("Warning! zero aa sequences have"
                    + " been read in for file %s" %fastaFile)
        aaSeqDict = dict(zip(idListSeq, seqList))

        try:
            fpin = open(inFile, "rb")
        except IOError:
            print >> sys.stderr, "Failed to open input file %s"%(inFile)
            return -1

        unprocessedBuffer = ""
        isEOFreached = False
        # Passed to every Topo2TMFrag call, so it is shared across chunks.
        processedTopoIDSet = set([])
        with fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The return value is the unparsed tail of the buffer; it is
                # prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    idListTopo = [r[0] for r in recordList]
                    topoList = [r[2] for r in recordList]
                    Topo2TMFrag(idListTopo, topoList, aaSeqDict,
                                processedTopoIDSet, fpout)
    finally:
        # Close a real output file on every exit path; never close stdout.
        if fpout is not sys.stdout:
            fpout.close()
コード例 #7
0
def main():  #{{{
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1
    outpath = "./"
    inFile = ""
    dgFile = ""
    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            isNonOptionArg = False
            i = i + 1
        elif sys.argv[i] == "--":
            isNonOptionArg = True
            i = i + 1
        elif sys.argv[i][0] == "-":
            if sys.argv[i] == "-h" or sys.argv[i] == "--help":
                PrintHelp()
                return 1
            elif sys.argv[i] == "-i" or sys.argv[i] == "--infile":
                inFile = sys.argv[i + 1]
                i = i + 2
            elif sys.argv[i] == "-dg" or sys.argv[i] == "--dg":
                dgFile = sys.argv[i + 1]
                i = i + 2
            elif sys.argv[i] == "-outpath" or sys.argv[i] == "--outpath":
                outpath = sys.argv[i + 1]
                i = i + 2
            else:
                print >> sys.stderr, ("Error! Wrong argument:%s" % sys.argv[i])
                return 1
        else:
            inFile = sys.argv[i]
            i += 1

    if inFile == "":
        print >> sys.stderr, "Error! Topology file not set."
        return 1
    if dgFile == "":
        print >> sys.stderr, "Error!  dgFile not set."
        return 1

    os.system("mkdir -p %s" % outpath)
    dgScoreDict = lcmp.ReadDGScore(dgFile)
    if dgScoreDict == {}:
        print >> sys.stderr, "Read DG score failed. exit"
        return 1

    rootname = os.path.basename(os.path.splitext(inFile)[0])
    outFileTopoDG = outpath + os.sep + rootname + ".topowithdgscore"

    fpTopoDG = myfunc.myopen(outFileTopoDG, None, "w", False)
    if fpTopoDG == None:
        return 1

    fpin = open(inFile, "rb")
    if not fpin:
        print >> sys.stderr, "Failed to open input file %s" % (inFile)
        return -1
    unprocessedBuffer = ""
    isEOFreached = False
    while 1:
        buff = fpin.read(BLOCK_SIZE)
        if len(buff) < BLOCK_SIZE:
            isEOFreached = True
        buff = unprocessedBuffer + buff
        recordList = []
        unprocessedBuffer = myfunc.ReadFastaFromBuffer(buff, recordList,
                                                       isEOFreached)
        if len(recordList) > 0:
            idListTopo = [r[0] for r in recordList]
            annotationListTopo = [r[1] for r in recordList]
            topoList = [r[2] for r in recordList]
            TopoAddDGscore(idListTopo, annotationListTopo, topoList,
                           dgScoreDict, fpTopoDG)
        if isEOFreached == True:
            break
    myfunc.myclose(fpTopoDG)
    print outFileTopoDG, "output"
コード例 #8
0
def main(g_params):  #{{{
    """Select records from a FASTA file by ID and/or e-value.

    Command-line arguments are taken from sys.argv:
      ID [ID ...]                      sequence IDs to select
      -l/--l/-list/--list FILE         file with IDs, read by myfunc.ReadIDList
      -f/--f/-fasta/--fasta FILE       FASTA file to read (required)
      -o/--o/-outfile/--outfile FILE   output file (default: stdout)
      -mine/--mine X, -maxe/--maxe X   e-value bounds
      -h/--help                        print help and exit

    A record is written when its ID passes the ID filter (if any ID was
    given) and, if an e-value bound was set and the annotation contains the
    word "evalue", its e-value lies within [min_evalue, max_evalue].
    Records whose e-value cannot be extracted (None) are written as well.

    Args:
        g_params: parameter dictionary.  'min_evalue', 'max_evalue' and
            'isEvalueSet' are updated from the command line; 'BLOCK_SIZE'
            is read.

    Returns:
        1 on a usage error, -1 when the FASTA file cannot be opened,
        otherwise None.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outFile = ""
    idList = []
    idListFile = ""
    fastaFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            idList.append(sys.argv[i])
            isNonOptionArg = False
            i = i + 1
        elif sys.argv[i] == "--":
            isNonOptionArg = True
            i = i + 1
        elif sys.argv[i].startswith("-"):
            if sys.argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif sys.argv[i] in ["-l", "--l", "-list", "--list"]:
                idListFile = sys.argv[i + 1]
                i = i + 2
            elif sys.argv[i] in ["-f", "--f", "-fasta", "--fasta"]:
                fastaFile = sys.argv[i + 1]
                i = i + 2
            elif sys.argv[i] in ["-o", "--o", "-outfile", "--outfile"]:
                outFile = sys.argv[i + 1]
                i = i + 2
            elif sys.argv[i] in ["-mine", "--mine"]:
                g_params['min_evalue'] = float(sys.argv[i + 1])
                g_params['isEvalueSet'] = True
                i = i + 2
            elif sys.argv[i] in ["-maxe", "--maxe"]:
                g_params['max_evalue'] = float(sys.argv[i + 1])
                g_params['isEvalueSet'] = True
                i = i + 2
            else:
                print(("Error! Wrong argument:%s" % sys.argv[i]),
                      file=sys.stderr)
                return 1
        else:
            idList.append(sys.argv[i])
            i += 1

    if fastaFile == "":
        print("Fatal!  fasta file not set. Exit.", file=sys.stderr)
        return 1
    elif not os.path.exists(fastaFile):
        print("Fatal! fasta file %s does not exist. Exit." % (fastaFile),
              file=sys.stderr)
        return 1

    if os.path.exists(idListFile):
        idList += myfunc.ReadIDList(idListFile)

    isIDSet = len(idList) > 0

    if not g_params['isEvalueSet'] and not isIDSet:
        print("Error! no ID nor evalue threshold is set. Eixt",
              file=sys.stderr)
        return 1

    idListSet = set(idList)
    BLOCK_SIZE = g_params['BLOCK_SIZE']
    isEvalueSet = g_params['isEvalueSet']
    min_evalue = g_params['min_evalue']
    max_evalue = g_params['max_evalue']

    fpout = myfunc.myopen(filename=outFile,
                          default_fp=sys.stdout,
                          mode="w",
                          isRaise=False)
    try:
        # open() raises instead of returning a false value, so the failure
        # message has to live in an exception handler.
        try:
            fpin = open(fastaFile, "r")
        except IOError:
            print("Failed to open fastafile %s" % (fastaFile),
                  file=sys.stderr)
            return -1

        unprocessedBuffer = ""
        isEOFreached = False
        with fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The return value is the unparsed tail of the buffer; it is
                # prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for r in recordList:
                    if isIDSet and r[0] not in idListSet:
                        continue
                    # The e-value filter applies only to annotations that
                    # mention "evalue".
                    if isEvalueSet and r[1].lower().find('evalue') >= 0:
                        evalue = myfunc.GetEvalueFromAnnotation(r[1])
                        if (evalue is not None and
                                not (min_evalue <= evalue <= max_evalue)):
                            continue
                    fpout.write(">%s\n" % r[1])
                    fpout.write("%s\n" % r[2])
    finally:
        # Close the output on every exit path.
        myfunc.myclose(fpout)
コード例 #9
0
def main(g_params):  #{{{
    """Write alignment statistics for consecutive pairs of topology records.

    Command-line arguments are taken from sys.argv:
      FILE                   topology file (bare argument or after "--")
      -outfile/--outfile F   output file (default: stdout)
      -seqdb/--seqdb F       accepted, but not used by the active code
      -q                     sets g_params['isQuiet']
      -h/--help              print help and exit

    Records are consumed two at a time.  For each pair (A, B) the function
    writes GetTopoAlignStat(A, B) for (idA, idB) and GetTopoAlignStat(B, A)
    for (idB, idA) using WriteStat.  An unpaired trailing record of a chunk
    is carried over to the next chunk.

    Args:
        g_params: parameter dictionary; 'isQuiet' may be set here.

    Returns:
        1 on a usage error or when the input cannot be opened, otherwise
        None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    seqdbfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outfile", "--outfile"]:
                outfile = argv[i + 1]
                i += 2
            elif argv[i] in ["-seqdb", "--seqdb"]:
                seqdbfile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1
    if infile == "":
        print >> sys.stderr, "topofile not set"
        return 1
    elif not os.path.exists(infile):
        print >> sys.stderr, "topofile %s does not exist" % (infile)
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        # open() raises instead of returning a false value, so the failure
        # message has to live in an exception handler.
        try:
            fpin = open(infile, "rb")
        except IOError:
            print >> sys.stderr, "Failed to open input file %s" % (infile)
            return 1

        unprocessedBuffer = ""
        unprocessedRecordList = []
        isEOFreached = False
        with fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                # Start from the record carried over from the last chunk so
                # that pairs spanning a chunk boundary stay together.
                recordList = unprocessedRecordList
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                numRecord = len(recordList)
                # Step through complete pairs only: (0, 1), (2, 3), ...
                for k in range(0, numRecord - 1, 2):
                    rd1 = recordList[k]
                    rd2 = recordList[k + 1]
                    id1 = rd1[0]
                    topo1 = rd1[2]
                    id2 = rd2[0]
                    topo2 = rd2[2]
                    stat1 = GetTopoAlignStat(topo1, topo2)
                    stat2 = GetTopoAlignStat(topo2, topo1)
                    WriteStat(id1, id2, stat1, fpout)
                    # The reversed pair gets the reversed statistic; stat2
                    # was computed for this line but stat1 was written.
                    WriteStat(id2, id1, stat2, fpout)

                if numRecord % 2 == 1:
                    unprocessedRecordList = [recordList[-1]]
                else:
                    unprocessedRecordList = []
    finally:
        # Close the output on every exit path.
        myfunc.myclose(fpout)
コード例 #10
0
def main(g_params):#{{{
    """Write every segment reported by myfunc.GetTMPosition for each record.

    Command-line arguments are taken from sys.argv:
      FILE                   topology file (bare argument or after "--")
      -outfile/--outfile F   output file (default: stdout)
      -seqdb/--seqdb F       accepted, but not used by the active code
      -q                     sets g_params['isQuiet']
      -h/--help              print help and exit

    For each record, myfunc.GetTMPosition is called on field 2 and one line
    "<seqid>\\t<index>\\t<segment>" is written per (begin, end) pair it
    returns, where index counts from 1 and segment is topo[begin:end].

    Args:
        g_params: parameter dictionary; 'isQuiet' may be set here.

    Returns:
        1 on a usage error or when the input cannot be opened, otherwise
        None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    seqdbfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outfile", "--outfile"]:
                outfile = argv[i+1]
                i += 2
            elif argv[i] in ["-seqdb", "--seqdb"]:
                seqdbfile = argv[i+1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1
    if infile == "":
        print >> sys.stderr, "topofile not set"
        return 1
    elif not os.path.exists(infile):
        print >> sys.stderr, "topofile %s does not exist"%(infile)
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        # open() raises instead of returning a false value, so the failure
        # message has to live in an exception handler.
        try:
            fpin = open(infile, "rb")
        except IOError:
            print >> sys.stderr, "Failed to open input file %s"%(infile)
            return 1

        unprocessedBuffer = ""
        isEOFreached = False
        with fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The return value is the unparsed tail of the buffer; it is
                # prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for rd in recordList:
                    seqid = rd[0]
                    topo = rd[2]
                    posTM = myfunc.GetTMPosition(topo)
                    for cnt, (b, e) in enumerate(posTM):
                        fpout.write("%s\t%4d\t%s\n"%(seqid, cnt+1, topo[b:e]))
    finally:
        # Close the output on every exit path.
        myfunc.myclose(fpout)
コード例 #11
0
def main(g_params):  #{{{
    """Write the value of myfunc.CountTM for every record of a topology file.

    Command-line arguments are taken from sys.argv:
      FILE               topology file (bare argument or after "--")
      -o/--o/-out FILE   output file (default: stdout)
      -q/--q             accepted; has no effect in this function
      -ni/--ni/-noid     do not print the ID column
      -h/--help          print help and exit

    Each output line is "<id>\\t<count>", or only "<count>" when the ID
    column is disabled, where count is myfunc.CountTM of record field 2.

    Args:
        g_params: global parameter dictionary; not read by this function.

    Returns:
        0 on success, 1 on a usage error or when the input cannot be opened.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    isPrintIDName = True
    outfile = ""
    topofile = ""
    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            isNonOptionArg = False
            topofile = sys.argv[i]
            i = i + 1
        elif sys.argv[i] == "--":
            isNonOptionArg = True
            i = i + 1
        elif sys.argv[i].startswith("-"):
            if sys.argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif sys.argv[i] in ["-o", "--o", "-out"]:
                outfile = sys.argv[i + 1]
                i = i + 2
            elif sys.argv[i] in ["-q", "--q"]:
                # Still accepted so existing command lines keep working.
                i = i + 1
            elif sys.argv[i] in ["-ni", "--ni", "-noid"]:
                isPrintIDName = False
                i = i + 1
            else:
                # Report on stderr like every other error in this function.
                print >> sys.stderr, "Error! Wrong argument:", sys.argv[i]
                return 1
        else:
            topofile = sys.argv[i]
            i = i + 1

    if topofile == "":
        print >> sys.stderr, "topofile not set. Exit."
        return 1
    elif not os.path.exists(topofile):
        print >> sys.stderr, "topofile %s doe not exist. Exit." % topofile
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        # open() raises instead of returning a false value, so the failure
        # message has to live in an exception handler.
        try:
            fpin = open(topofile, "rb")
        except IOError:
            print >> sys.stderr, "Failed to open input file %s" % (topofile)
            return 1

        unprocessedBuffer = ""
        isEOFreached = False
        with fpin:
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # A short read means there is nothing left in the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The return value is the unparsed tail of the buffer; it is
                # prepended to the next chunk.
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for rd in recordList:
                    if isPrintIDName:
                        fpout.write("%s\t" % rd[0])
                    fpout.write("%d\n" % myfunc.CountTM(rd[2]))
    finally:
        # Close the output on every exit path.
        myfunc.myclose(fpout)

    return 0
コード例 #12
0
def ValidateSeq(rawseq, seqinfo, g_params):  #{{{
    """Validate and normalise a chunk of user-supplied FASTA text.

    Steps, in order:
      1. Replace non-ASCII characters and the vertical tab with a space.
      2. Parse the text with myfunc.ReadFastaFromBuffer.
      3. Drop records that are empty, shorter than MIN_LEN_SEQ, longer than
         MAX_LEN_SEQ, or classified as DNA by myfunc.IsDNASeq; each removal
         adds a warning.
      4. Reject the input when no record is left, when there are more than
         MAX_NUMSEQ_PER_JOB records, or, in "Force Run" mode, more than
         MAX_NUMSEQ_FOR_FORCE_RUN records.
      5. Reject the input when any sequence contains a letter outside
         "ABCDEFGHIKLMNPQRSTUVWYZX*-".
      6. Convert B and Z to X and U to C, delete "*" and "-", and drop
         sequences that became shorter than MIN_LEN_SEQ.

    Args:
        rawseq: the raw FASTA text.
        seqinfo: dictionary updated in place.  'errinfo_br',
            'errinfo_content' and 'warninfo' are created as "" when
            missing; 'isValidSeq' and 'errinfo' are always set; 'numseq'
            and 'warninfo' are set when the input is valid.  An optional
            'isForceRun' entry is read.
        g_params: dictionary providing 'MIN_LEN_SEQ', 'MAX_LEN_SEQ',
            'MAX_NUMSEQ_PER_JOB' and 'MAX_NUMSEQ_FOR_FORCE_RUN'.

    Returns:
        The validated sequences as FASTA text, or "" when the input is
        invalid.
    """
    rawseq = re.sub(r'[^\x00-\x7f]', r' ',
                    rawseq)  # remove non-ASCII characters
    rawseq = re.sub(r'[\x0b]', r' ',
                    rawseq)  # filter invalid characters for XML
    filtered_seq = ""
    # initialization
    for item in ['errinfo_br', 'errinfo', 'errinfo_content', 'warninfo']:
        if item not in seqinfo:
            seqinfo[item] = ""

    seqinfo['isValidSeq'] = True

    parsedRecordList = []
    myfunc.ReadFastaFromBuffer(rawseq, parsedRecordList, True, 0, 0)

    # Filter out empty sequences, sequences shorter than MIN_LEN_SEQ or
    # longer than MAX_LEN_SEQ, and sequences that look like DNA.
    seqRecordList = []
    li_warn_info = []
    for cnt, rd in enumerate(parsedRecordList):
        seq = rd[2].strip()
        seqid = rd[0].strip()
        if len(seq) == 0:
            msg = "Empty sequence %s (SeqNo. %d) is removed." % (seqid,
                                                                 cnt + 1)
            li_warn_info.append(msg)
        elif len(seq) < g_params['MIN_LEN_SEQ']:
            msg = "Sequence %s (SeqNo. %d) is removed since its length is < %d." % (
                seqid, cnt + 1, g_params['MIN_LEN_SEQ'])
            li_warn_info.append(msg)
        elif len(seq) > g_params['MAX_LEN_SEQ']:
            msg = "Sequence %s (SeqNo. %d) is removed since its length is > %d." % (
                seqid, cnt + 1, g_params['MAX_LEN_SEQ'])
            li_warn_info.append(msg)
        elif myfunc.IsDNASeq(seq):
            msg = "Sequence %s (SeqNo. %d) is removed since it looks like a DNA sequence." % (
                seqid, cnt + 1)
            li_warn_info.append(msg)
        else:
            seqRecordList.append(rd)

    numseq = len(seqRecordList)
    # Bound up front so the bad-letter check does not depend on which
    # branch below was taken.
    li_badseq_info = []

    if numseq < 1:
        seqinfo['errinfo_br'] += "Number of input sequences is 0!\n"
        t_rawseq = rawseq.lstrip()
        if t_rawseq and t_rawseq[0] != '>':
            seqinfo[
                'errinfo_content'] += "Bad input format. The FASTA format should have an annotation line start with '>'.\n"
        # Every removal above appended exactly one warning, so an empty
        # list means no record was removed and the input itself is not
        # FASTA.
        if li_warn_info:
            seqinfo['errinfo_content'] += "\n".join(li_warn_info) + "\n"
        else:
            seqinfo[
                'errinfo_content'] += "Please input your sequence in FASTA format.\n"

        seqinfo['isValidSeq'] = False
    elif numseq > g_params['MAX_NUMSEQ_PER_JOB']:
        seqinfo[
            'errinfo_br'] += "Number of input sequences exceeds the maximum (%d)!\n" % (
                g_params['MAX_NUMSEQ_PER_JOB'])
        seqinfo[
            'errinfo_content'] += "Your target sequence field has %d sequences. " % (
                numseq)
        seqinfo[
            'errinfo_content'] += "However, the maximal allowed sequences for this field is %d. " % (
                g_params['MAX_NUMSEQ_PER_JOB'])
        seqinfo['isValidSeq'] = False
    elif (seqinfo.get('isForceRun')
          and numseq > g_params['MAX_NUMSEQ_FOR_FORCE_RUN']):
        seqinfo['errinfo_br'] += "Invalid input!"
        seqinfo['errinfo_content'] += (
            "You have chosen the \"Force Run\" mode. "
            "The maximum allowable number of sequences of a job is %d. "
            "However, your input has %d sequences." % (
                g_params['MAX_NUMSEQ_FOR_FORCE_RUN'], numseq))
        seqinfo['isValidSeq'] = False

    # checking for bad sequences in the query
    if seqinfo['isValidSeq']:
        for i, rd in enumerate(seqRecordList):
            seqid = rd[0].strip()
            seq = re.sub(r"[\s\n\r\t]", '', rd[2].strip().upper())
            for m in re.finditer(r"[^ABCDEFGHIKLMNPQRSTUVWYZX*-]", seq):
                pos = m.start()
                msg = "Bad letter for amino acid in sequence %s (SeqNo. %d) "\
                        "at position %d (letter: '%s')"%(seqid, i+1,
                                pos+1, seq[pos])
                li_badseq_info.append(msg)

        if len(li_badseq_info) > 0:
            seqinfo[
                'errinfo_br'] += "There are bad letters for amino acids in your query!\n"
            # NOTE(review): this assigns rather than appends, unlike every
            # other errinfo_content update here - confirm it is intended.
            seqinfo['errinfo_content'] = "\n".join(li_badseq_info) + "\n"
            seqinfo['isValidSeq'] = False

    # convert some non-classical letters to the standard amino acid symbols
    # Scheme:
    #    out of these 26 letters in the alphabet,
    #    B, Z -> X
    #    U -> C
    #    *, - will be deleted
    if seqinfo['isValidSeq']:
        li_newseq = []
        for i, rd in enumerate(seqRecordList):
            seqid = rd[0].strip()
            anno = rd[1].strip().replace('\t', ' ')  # replace tab by space
            seq = re.sub(r"[\s\n\r\t]", '', rd[2].strip().upper())

            seq = _ReplaceAndWarn(
                seq, r"[BZ]", "X",
                "Amino acid in sequence %s (SeqNo. %d) at position %d "
                "(letter: '%s') has been replaced by 'X'",
                seqid, i + 1, li_warn_info)
            seq = _ReplaceAndWarn(
                seq, r"[U]", "C",
                "Amino acid in sequence %s (SeqNo. %d) at position %d "
                "(letter: '%s') has been replaced by 'C'",
                seqid, i + 1, li_warn_info)
            seq = _ReplaceAndWarn(
                seq, r"[*]", "",
                "Translational stop in sequence %s (SeqNo. %d) at position %d "
                "(letter: '%s') has been deleted",
                seqid, i + 1, li_warn_info)
            seq = _ReplaceAndWarn(
                seq, r"[-]", "",
                "Gap in sequence %s (SeqNo. %d) at position %d "
                "(letter: '%s') has been deleted",
                seqid, i + 1, li_warn_info)

            # check the sequence length again after potential removal of
            # translation stop
            if len(seq) < g_params['MIN_LEN_SEQ']:
                msg = "Sequence %s (SeqNo. %d) is removed since its length is < %d (after removal of translation stop)." % (
                    seqid, i + 1, g_params['MIN_LEN_SEQ'])
                li_warn_info.append(msg)
            else:
                li_newseq.append(">%s\n%s" % (anno, seq))

        filtered_seq = "\n".join(li_newseq)  # seq content after validation
        seqinfo['numseq'] = len(li_newseq)
        seqinfo['warninfo'] = "\n".join(li_warn_info) + "\n"

    seqinfo['errinfo'] = seqinfo['errinfo_br'] + seqinfo['errinfo_content']
    return filtered_seq


def _ReplaceAndWarn(seq, pattern, repl, msgfmt, seqid, seqno, li_warn_info):
    """Replace every match of pattern in seq by repl, warning once per match.

    For each match, msgfmt % (seqid, seqno, position, letter) is appended to
    li_warn_info, where position is 1-based and refers to seq as passed in.
    Returns the sequence after replacement.
    """
    hits = [m.start() for m in re.finditer(pattern, seq)]
    for pos in hits:
        li_warn_info.append(msgfmt % (seqid, seqno, pos + 1, seq[pos]))
    if hits:
        seq = re.sub(pattern, repl, seq)
    return seq
コード例 #13
0
ファイル: test.py プロジェクト: vam-sin/bioinfo-toolbox
def main():  #{{{
    """Scratch driver holding a series of manually toggled experiments.

    Every experiment is wrapped in an ``if 0:`` / ``if 1:`` guard and only
    the sections whose guard is ``1`` are executed.  As written, the only
    enabled section is the last one (subprocess calls using an argument
    list).  Most disabled sections take their input path from
    ``sys.argv[1]``.

    With the guards as written the function falls off the end and returns
    None; two disabled sections contain early ``return`` statements that are
    taken when a reader object reports ``failure``.
    """
    # Disabled: compare two gapped topology strings and print the inputs
    # after the call.
    if 0:  #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: int, float and str arguments are not modified by a call,
        # whereas dict and list arguments can be modified in place.
        print "strTop1:", strTop1
        print "strTop2:", strTop2
#}}}
    # Disabled: call PrintFuncName() and print the path of this file.
    if 0:  #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
#}}}
    # Disabled: read the whole file at once with readlines().
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
#}}}
    # Disabled: read the file in fixed-size blocks, discarding the content.
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
#}}}
    # Disabled: read the file line by line with readline(), discarding the
    # content.
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
        #}}}
    # Disabled: parse a FASTA file block by block with
    # myfunc.ReadFastaFromBuffer and echo every record to stdout.
    if 0:  #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # a short read marks the last block
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                # prepend the text left over from the previous block
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        # record[1] is written as the header line and
                        # record[2] as the sequence line
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            raise
            #}}}
    # Disabled: read a FASTA file with myfunc.ReadFasta_without_id and echo
    # it to stdout.
    if 0:  #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
            #}}}
    # Disabled: run IsDuplicatedByHHSearch on a hard-coded .hhr file.
    if 0:  #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"

#}}}
    # Disabled: compute the alignment factor of a hard-coded pairwise
    # alignment.  seq1/seq2 are assigned twice; the second pair overrides
    # the first, so only the second alignment is evaluated.
    if 0:  #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
#}}}
    # Disabled: open a MyDB database and look up the record "A0FGX9".
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            # raised when sys.argv[1] is missing; silently skip the test
            pass

#}}}
    # Disabled: fetch every ID listed in sys.argv[2] from the MyDB database
    # named by sys.argv[1] and write the records to stdout.
    if 0:  #{{{
        import my_extractdb
        # mimicking my_extractdb.py to see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    # skip empty entries produced by the split
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
            #             for rd in  cls.GetAllRecord():
            #                 print rd
#                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
#                print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
#}}}
    if 0:  #{{{ #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            # the loop stops when readlines() returns None
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]

            # timing 1: ReadLineByBlock
            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))

            # timing 2: plain file.readline()
            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test the speed of GetFirstWord
        try:
            # sys.argv[1] gives the number of iterations
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            #            string = "kajsdfasdfsdfjakasjdfka"
            #            string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
#}}}
    # Compare four ways of reading the same FASTA file and report the
    # elapsed time of each.
    if 0:  #{{{ # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))

            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # a short read marks the last block
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            # the loop stops when readseq() returns None
            while recordList != None:
                cnt += len(recordList)
                #                 for rd in recordList:
                #                     print ">%s"%rd.description
                #                     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
#}}}
    if 0:  #{{{ #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            # timing 1: the old implementation
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

            # timing 2: the current implementation, on a fresh read of the
            # same file
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)

            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
#}}}
    # Disabled: same MyDB lookup of "A0FGX9" as the earlier MyDB section.
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    if 0:  #{{{ #test subprocess
        import glob
        # invoking the shell explicitly is not recommended; it may have
        # security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    # Enabled: the only section that runs with the guards as written.
    if 1:  #{{{ #test subprocess
        import glob
        # pass an argument list without a shell, the recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            print subprocess.check_call(["ls",
                                         "topo*.py"])  # This will not work: no shell is involved, so the wildcard is not expanded
        except subprocess.CalledProcessError, e:
            print "error message:", e
        # expand the wildcard in Python instead and pass the matches to ls
        subprocess.call(["ls"] + glob.glob("topo*.py"))
コード例 #14
0
def main():  #{{{
    """Cut every sequence of an MSA to the span chosen from the target.

    Command line: ``[-o OUTFILE] [--] MSAFILE``.  The first record of
    MSAFILE must have the ID "target".  GetPositionTermGapless (defined
    elsewhere; judging by its name and the original comment it yields the
    begin/end positions of the target without terminal gaps -- confirm)
    supplies a (begin, end) pair, and every record is written as
    ``>annotation`` followed by ``sequence[begin:end]``.

    Returns:
        1 on a usage or validation error, -1 when the input file cannot be
        opened, and None after a successful run.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            # the single argument following "--" is taken as the input file
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):  # startswith() is safe for an empty string
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            elif arg in ("-o", "--o"):
                # guard against "-o" being the last argument
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires an argument.\n" % arg)
                    return 1
                outfile = sys.argv[i + 1]
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument:%s\n" % arg)
                return 1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("Error! MSA file not set.\n")
        return 1
    elif not os.path.exists(infile):
        sys.stderr.write("Error! MSA file %s does not exist.\n" % (infile))
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        # open() raises IOError on failure; it never returns a false value
        try:
            fpin = open(infile, "rb")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" % (infile))
            return -1
        try:
            isFirstSeqSet = False
            # slice bounds taken from the target; applied to every record
            targetSeqBeg = 0
            targetSeqEnd = 0
            unprocessedBuffer = ""
            isEOFreached = False
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # a short read marks the last block
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # text left over from the previous block is prepended
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                if recordList and not isFirstSeqSet:
                    (targetSeqID, _anno, targetSeq) = recordList[0][:3]
                    if targetSeqID != "target":
                        sys.stderr.write(
                            "Error, the first sequence is not target "
                            "sequence in file %s\n" % (infile))
                        return 1
                    (targetSeqBeg,
                     targetSeqEnd) = GetPositionTermGapless(targetSeq)
                    isFirstSeqSet = True
                for rd in recordList:
                    fpout.write(">%s\n" % (rd[1]))
                    fpout.write("%s\n" % (rd[2][targetSeqBeg:targetSeqEnd]))
        finally:
            fpin.close()
    finally:
        # runs on every exit path, including the early error returns
        myfunc.myclose(fpout)
コード例 #15
0
def main():  #{{{
    """Keep only the MSA segments selected from the target sequence.

    Command line: ``[-o OUTFILE] [--] MSAFILE``.  The first record of
    MSAFILE must have the ID "target".  GetTrimmedPosition (defined
    elsewhere; per the original comment it yields the segments left after
    removing the gaps in the target sequence -- confirm) supplies a list of
    (begin, end) pairs.  For every record those slices of the sequence are
    concatenated, "X" is replaced by GAP, and the result is written under
    the header ``target/1-N`` or ``sequenceNNNNNNN/1-N``.

    Returns:
        1 on a usage or validation error, -1 when the input file cannot be
        opened, and None after a successful run.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            # the single argument following "--" is taken as the input file
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):  # startswith() is safe for an empty string
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            elif arg in ("-o", "--o"):
                # guard against "-o" being the last argument
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires an argument.\n" % arg)
                    return 1
                outfile = sys.argv[i + 1]
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument:%s\n" % arg)
                return 1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("Error! MSA file not set.\n")
        return 1
    elif not os.path.exists(infile):
        sys.stderr.write("Error! MSA file %s does not exist.\n" % (infile))
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        # open() raises IOError on failure; it never returns a false value
        try:
            fpin = open(infile, "rb")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" % (infile))
            return -1
        try:
            isFirstSeqSet = False
            trimmedSeqPosList = []  # (begin, end) slices kept from each record
            trimmedseqLength = 0
            unprocessedBuffer = ""
            isEOFreached = False
            cntseq = 0  # running number used to name non-target sequences
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                # a short read marks the last block
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # text left over from the previous block is prepended
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                if recordList and not isFirstSeqSet:
                    targetSeqID = recordList[0][0]
                    targetSeq = recordList[0][2]
                    if targetSeqID != "target":
                        sys.stderr.write(
                            "Error, the first sequence is not target "
                            "sequence in file %s\n" % (infile))
                        return 1
                    trimmedSeqPosList = GetTrimmedPosition(targetSeq)
                    # NOTE(review): the original computed
                    # sum(e - b for (b, e) in trimmedSeqPosList) here and
                    # then immediately overwrote it with the literal 100.
                    # The emitted value is kept as 100 so the output is
                    # unchanged -- confirm whether the headers should carry
                    # the real trimmed length instead.
                    trimmedseqLength = 100
                    isFirstSeqSet = True
                for rd in recordList:
                    if rd[0] == "target":
                        desp = "target/1-%d" % (trimmedseqLength)
                    else:
                        cntseq += 1
                        desp = "sequence%07d/1-%d" % (cntseq,
                                                      trimmedseqLength)
                    trimmedseq = "".join(
                        [rd[2][p[0]:p[1]] for p in trimmedSeqPosList])
                    trimmedseq = trimmedseq.replace("X", GAP)
                    fpout.write(">%s\n" % (desp))
                    fpout.write("%s\n" % (trimmedseq))
        finally:
            fpin.close()
    finally:
        # runs on every exit path, including the early error returns
        myfunc.myclose(fpout)