Esempio n. 1
0
def GetTopoDict(topoDB, idList):
    hdl = myfunc.MyDB(topoDB)
    if hdl.failure:
        return {}
    dt = {}
    for seqid in idList:
        data = hdl.GetRecord(seqid)
        if data:
            (tmp_id, tmp_anno, tmp_seq) = myfunc.ExtractFromSeqWithAnno(data)
            dt[seqid] = tmp_seq
    hdl.close()
    return dt
Esempio n. 2
0
def createHitDB(pfamList, prot_name, work_dir):
    hdl = myfunc.MyDB(cddseqdb)
    if hdl.failure:
        print "Error"
        return 1
    with open(work_dir + prot_name + ".hits.db.temp", "w") as outFile:
        for pfamid in pfamList:
            record = hdl.GetRecord(pfamid)
            if record:
                outFile.write(record)
        hdl.close()

    os.system("python my_uniqueseq.py " + work_dir + prot_name +
              ".hits.db.temp")
Esempio n. 3
0
def createHitDB_mydb(pfamList, prot_name, work_dir, pfamseqdb):  # {{{
    """Create the blast seqdb from pfam_scan hits, using the MyDB version of pfamfullseqdb"""
    outfile_seqdb = os.path.join(work_dir, prot_name + ".hits.db.temp")
    hdl = myfunc.MyDB(pfamseqdb)
    if hdl.failure:
        print("Failed to open pfamseqdb %s with MyDB()" % (pfamseqdb))
        return 1
    with open(outfile_seqdb, "w") as outFile:
        for pfamid in pfamList:
            record = hdl.GetRecord(pfamid)
            if record:
                outFile.write(record)
        hdl.close()

    os.system("python my_uniqueseq.py " + outfile_seqdb)
Esempio n. 4
0
def MatchMSATopo_using_topodb(
        msafile,
        topodb,
        isIgnoreBadseq,  #{{{
        method_match,
        outfile):
    hdl_topo = myfunc.MyDB(topodb)
    if hdl_topo.failure:
        return 1

    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            topowithanno = hdl_topo.GetRecord(rd.seqid)
            if topowithanno != None:
                (topoid, topoanno,
                 topo) = myfunc.ExtractFromSeqWithAnno(topowithanno)
            else:
                print("topo not found for ID %s" % (rd.seqid), file=sys.stderr)
                topo = ""
            matchedtopo = MatchSeqToTopo(rd.seq, topo, method_match)
            if not (matchedtopo == "BADSEQ" and isIgnoreBadseq):
                print(">%s" % (rd.description), file=fpout)
                print("%s" % (matchedtopo), file=fpout)
        recordList = hdl.readseq()

    myfunc.myclose(fpout)
    hdl.close()
    hdl_topo.close()

    return 0
Esempio n. 5
0
        except subprocess.CalledProcessError, e:
            print e
            return 1

    if myfunc.checkfile(pfamid2seqidFile, "pfamid2seqidFile") != 0:
        return 1

    if myfunc.checkfile("%s0.db" % topodb, "topodb") != 0:
        return 1

    if myfunc.checkfile("%s0.db" % seqdb, "seqdb") != 0:
        return 1

    pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidFile)

    hdl_topo = myfunc.MyDB(topodb)
    if not hdl_topo.failure:
        idSet_topo = set(hdl_topo.indexedIDList)
    else:
        idSet_topo = set([])
        print >> sys.stderr, "Failed to open topology database %s" % (topodb)
        return 1

    hdl_seq = myfunc.MyDB(seqdb)
    if hdl_seq.failure:
        print >> sys.stderr, "Failed to open sequence database %s" % (seqdb)
        return 1

    GetTMProList_per_family(pfamid2seqidDict, idSet_topo, hdl_seq, hdl_topo,
                            outpath)
Esempio n. 6
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    SPE_PAIR_LIST = [(2, 1), (2, 4), (2, 6), (2, 8), (3, 6), (3, 7), (4, 6),
                     (4, 8), (4, 10), (5, 7), (5, 10), (6, 8), (6, 10),
                     (6, 12), (7, 14), (8, 10), (8, 12), (10, 12), (10, 13),
                     (11, 13), (12, 14)]

    outfile = ""

    infile = ""
    pfamDefFile = "%s/data/pfam/pfam26.0/Pfam-A.clans.tsv" % (DATADIR3)
    signalpFile = "%s/wk/MPTopo/pfamAna_refpro/pred_signalp/refpro20120604-celluar.selmaxlength-m1.nr100.signalp_list" % (
        DATADIR3)

    #seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2clanid"%(DATADIR3)
    #seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2pfamid"%(DATADIR3)
    seqid2clanidMapFile = ""
    seqid2pfamidMapFile = ""
    tm_pfamidListFile = ""
    tm_clanidListFile = ""
    pfamid2seqidMapFile = ""
    clanid2seqidMapFile = ""
    dbname_predTM = ""
    pairlistwithpfamidFile = ""

    pfamtype = ""

    pairListFile = ""

    #classList_TableNumTMHeatMap = ["ALL", "RMSP"]
    classList_TableNumTMHeatMap = ["ALL"]

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (g_params['outpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-signalp", "--signalp"]:
                (signalpFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mp", "--mp"]:
                g_params[
                    'pairwise_comparison_method'], i = myfunc.my_getopt_int(
                        argv, i)
            elif argv[i] in ["-mindiffpair", "--mindiffpair"]:
                g_params['mindiffpair'], i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-pfamtype", "--pfamtype"]:
                pfamtype, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanidlist", "--clanidlist"]:
                (tm_clanidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamidlist", "--pfamidlist"]:
                (tm_pfamidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2clanid", "--seqid2clanid"]:
                (seqid2clanidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamid2seqid", "--pfamid2seqid"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanid2seqid", "--clanid2seqid"]:
                (clanid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlistwithpfamid", "--pairlistwithpfamid"]:
                (pairlistwithpfamidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-predTMdbname", "--predTMdbname"]:
                (dbname_predTM, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlist", "--pairlist"]:
                (pairListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-winsize", "--winsize"]:
                (g_params['winsize'], i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-outname", "--outname"]:
                (g_params['outname'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-prokar", "--prokar"]:
                g_params['isOnlyAnaProkar'] = True
                i += 1
            elif argv[i] in ["-eukar", "--eukar"]:
                g_params['isOnlyAnaEukar'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if myfunc.checkfile(
            infile, "%s (line %d): infile" %
        (__file__, inspect.currentframe().f_lineno)) != 0:
        return 1

    dirpath = myfunc.my_dirname(infile)

    # try to obtain Pfam family tag
    tag = ""
    if pfamtype != "":
        if pfamtype.upper().find("FAM") != -1:
            tag = ".Family"
        elif pfamtype.upper().find("DOM") != -1:
            tag = ".Domain"
        elif pfamtype.upper().find("REP") != -1:
            tag = ".Repeat"
        elif pfamtype.upper().find("MOT") != -1:
            tag = ".Motif"
        else:
            tag = ""
    else:
        if infile.find(".Family.") != -1:
            tag = ".Family"
        elif infile.find(".Domain.") != -1:
            tag = ".Domain"
        elif infile.find(".Repeat.") != -1:
            tag = ".Repeat"
        elif infile.find(".Motif.") != -1:
            tag = ".Motif"
        else:
            tag = ""

    if seqid2clanidMapFile == "":
        seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.seqid2clanid" % (
            DATADIR3)
    if myfunc.checkfile(
            seqid2clanidMapFile, "%s (line %d): seqid2clanidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if seqid2pfamidMapFile == "":
        seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.seqid2pfamid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            seqid2pfamidMapFile, "%s (line %d): seqid2pfamidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if pfamid2seqidMapFile == "":
        pfamid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.pfamid2seqid" % (
            DATADIR3)
    if myfunc.checkfile(
            pfamid2seqidMapFile, "%s (line %d): pfamid2seqidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if clanid2seqidMapFile == "":
        clanid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.clanid2seqid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            clanid2seqidMapFile, "%s (line %d): clanid2seqidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_pfamidListFile == "":
        tm_pfamidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.pfamidlist" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            tm_pfamidListFile, "%s (line %d): tm_pfamidListFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_clanidListFile == "":
        tm_clanidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.clanidlist" % (
            DATADIR3)
    if myfunc.checkfile(
            tm_clanidListFile, "%s (line %d): tm_clanidListFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if dbname_predTM == "":
        dbname_predTM = "%s/wk/MPTopo/pfamAna_refpro/pred_topcons_single_method4/refpro20120604-celluar.selmaxlength-m1.topcons-single_topcons_single.m1.agree-44.RMSP" % (
            DATADIR3)
    if myfunc.checkfile(
            "%s0.db" % (dbname_predTM), "%s (line %d): dbname_predTM" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if g_params['isOnlyAnaProkar']:
        prokarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Prokaryota.seqidlist" % (
            DATADIR3)
        g_params['prokarSeqIDSet'] = set(myfunc.ReadIDList(prokarseqidfile))
        if len(g_params['prokarSeqIDSet']) < 1:
            return 1
    if g_params['isOnlyAnaEukar']:
        eukarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Eukaryota.seqidlist" % (
            DATADIR3)
        g_params['eukarSeqIDSet'] = set(myfunc.ReadIDList(eukarseqidfile))
        if len(g_params['eukarSeqIDSet']) < 1:
            return 1

    if pairlistwithpfamidFile == "":
        pairlistwithpfamidFile = "%s/../../Pfam-.maxpair100.pairlistwithpfamid" % (
            dirpath)
    if myfunc.checkfile(
            pairlistwithpfamidFile, "%s (line %d): pairlistwithpfamidFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    pfamid_2_seqidpair_Dict = ReadPairListWithFamID(pairlistwithpfamidFile)
    usedPfamIDSet = set(
        pfamid_2_seqidpair_Dict.keys())  # pfamids used in pair selection

    if pairListFile != "":
        li = myfunc.ReadPairList(pairListFile)
        SPE_PAIR_LIST = []
        for tup in li:
            SPE_PAIR_LIST.append((int(tup[0]), int(tup[1])))

    (pfamidDefDict, clanidDefDict) = ReadPfamDefFile(pfamDefFile)
    signalpDict = lcmp.ReadSignalPDict(signalpFile)

    seqid2clanidDict = myfunc.ReadFam2SeqidMap(seqid2clanidMapFile)
    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)

    clanid2seqidDict = myfunc.ReadFam2SeqidMap(clanid2seqidMapFile)
    pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tm_pfamidList = myfunc.ReadIDList(tm_pfamidListFile)
    tm_clanidList = myfunc.ReadIDList(tm_clanidListFile)

    tm_pfamidSet = set(tm_pfamidList)
    tm_clanidSet = set(tm_clanidList)

    hdl_predTM = myfunc.MyDB(dbname_predTM)
    if not hdl_predTM.failure:
        idSet_TMpro = set(hdl_predTM.indexedIDList)
    else:
        idSet_TMpro = set([])

    #classList_TableNumTMHeatMap = ["ALL", "RMSP", "RMDUP"]
    #alignrangeList = ['FULL_ALIGNED', 'all', 'PART_ALIGNED']
    alignrangeList = ['FULL_ALIGNED']

    if g_params['outpath'] != "" and not os.path.exists(g_params['outpath']):
        cmd = ["mkdir", "-p", g_params['outpath']]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError, e:
            print e
            return 1
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    outfile = ""
    idListFile = ""
    uniprotDBname = ""
    idList = []

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (idListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-uniprotdb", "--uniprotdb"]:
                (uniprotDBname, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            idList.append(argv[i])
            i += 1

    if idListFile != "":
        idList += myfunc.ReadIDList(idListFile)

    if uniprotDBname == "":
        print >> sys.stderr, "uniprotdb not set"
        return 1
    uniprotdbfile = "%s0.db" % uniprotDBname
    if myfunc.checkfile(uniprotdbfile, "uniprotdbfile") != 0:
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    hdl = myfunc.MyDB(uniprotDBname)
    if hdl.failure:
        return 1

    for seqid in idList:
        data = hdl.GetRecord(seqid)
        if data != None:
            goinfo = GetGOInfoFromUniprotData(data)
            WriteGOInfo(seqid, goinfo, fpout)
    hdl.close()
    myfunc.myclose(fpout)
def main(g_params):#{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    seqdb = ""
    listfile = ""

    i = 1
    isNonOptionArg=False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile", "--outfile"]:
                outfile = argv[i+1]
                i += 2
            elif argv[i] in ["-i", "--i"] :
                infile = argv[i+1]
                i += 2
            elif argv[i] in ["-l", "--l"] :
                listfile = argv[i+1]
                i += 2
            elif argv[i] in ["-seqdb", "--seqdb"] :
                seqdb = argv[i+1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    runList = [] # runList is a list of tuples (infile, outfile)


    if infile != "":
        runList.append((infile, outfile))
    if listfile != "":
        if os.path.exists(listfile):
            try:
                fpin = open(listfile, "rU")
                lines = fpin.readlines()
                fpin.close()
                for line in lines:
                    if not line or line[0]== "#":
                        continue
                    strs = line.split("\t")
                    infile = strs[0].strip()
                    try:
                        outfile = strs[1].strip()
                    except IndexError:
                        outfile = ""
                    runList.append((infile, outfile))
            except IOError:
                print >> sys.stderr, "Failed to read file %s"%(listfile)
        else:
            print >> sys.stderr, "listfile %s does not exist"%(listfile)

    numInput = len(runList)
    if numInput < 1:
        print >> sys.stderr, "No input set. Exit"
        return 1

    if seqdb == "":
        print >> sys.stderr, "seqdb not set."
        return 1
    elif not os.path.exists(seqdb+"0.db"):
        print >> sys.stderr, "seqdb %s does not exist."%(seqdb)
        return 1

    hdl_seqdb = myfunc.MyDB(seqdb)
    if hdl_seqdb.failure:
        print >> sys.stderr, "Failed to open seqdb %s"%(seqdb)
        return 1

    for (infile, outfile) in runList:
        GetSeqFromMSA(infile, outfile, hdl_seqdb)

    hdl_seqdb.close()
    
    return 0
Esempio n. 9
0
def main():  #{{{
    if 0:  #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: calling the int, float, string will not change their original value
        # calling the dict, list will change their original value
        print "strTop1:", strTop1
        print "strTop2:", strTop2
#}}}
    if 0:  #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
#}}}
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
#}}}
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
#}}}
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
        #}}}
    if 0:  #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            raise
            #}}}
    if 0:  #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
            #}}}
    if 0:  #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"

#}}}
    if 0:  #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
#}}}
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    if 0:  #{{{
        import my_extractdb
        #miniking my_extractdb.py see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
            #             for rd in  cls.GetAllRecord():
            #                 print rd
#                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
#                print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
#}}}
    if 0:  #{{{ #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times fater than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]

            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))

            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            #            string = "kajsdfasdfsdfjakasjdfka"
            #            string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
#}}}
    if 0:  #{{{ # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))

            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                cnt += len(recordList)
                #                 for rd in recordList:
                #                     print ">%s"%rd.description
                #                     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
#}}}
    if 0:  #{{{ #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)

            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
#}}}
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    if 0:  #{{{ #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    if 1:  #{{{ #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            print subprocess.check_call(["ls",
                                         "topo*.py"])  #This will not work
        except subprocess.CalledProcessError, e:
            print "error message:", e
        subprocess.call(["ls"] + glob.glob("topo*.py"))
Esempio n. 10
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    outfile_tableinfo = ""
    outfile_stat = ""
    fileListFile = ""
    fileList = []
    seqdb = ""
    evalue_threshold = 1e-3
    coverage_threshold = 0.0

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            fileList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-ot", "--ot"]:
                (outfile_tableinfo, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-os", "--os"]:
                (outfile_stat, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-evalue", "--evalue"]:
                (evalue_threshold, i) = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-coverage", "--coverage"]:
                (coverage_threshold, i) = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            fileList.append(argv[i])
            i += 1

    if fileListFile != "":
        fileList += myfunc.ReadIDList(fileListFile)
    if len(fileList) < 1:
        print >> sys.stderr, "No input set. exit"
        return 1
    if seqdb == "":
        print >> sys.stderr, "Seqdb not set. exit"
        return 1

    hdl_seq = myfunc.MyDB(seqdb)
    if hdl_seq.failure:
        return 1

    fpout = sys.stdout
    fpout_tableinfo = None
    fpout_stat = None
    if outfile != "":
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    if outfile_tableinfo != "":
        fpout_tableinfo = myfunc.myopen(outfile_tableinfo, None, "w", False)
    if outfile_stat != "":
        fpout_stat = myfunc.myopen(outfile_stat, None, "w", False)

    if fpout_stat != None:
        fpout_stat.write("%-8s %-8s %7s %8s %6s %6s %6s %9s %4s %9s %4s\n" % (
            "#ID1",
            "ID2",
            "Evalue",
            "Coverage",
            "IDT",
            "Prob",
            "AlnCol",
            "PosQuery",
            "LenQ",
            "PosTemp",
            "LenT",
        ))
    if fpout_tableinfo != None:
        fpout_tableinfo.write(
            "#%-15s %-15s %6s %6s %9s %6s %6s %9s %6s %6s %6s %6s %6s\n" %
            ("Seq1", "Seq2", "IDT0", "SIM0", "AlnLength", "Len1", "Len2",
             "Score", "N_IDT", "N_SIM", "N_GAP", "IDT1", "IDT2"))

    for infile in fileList:
        HHAlign2Pairaln(infile, evalue_threshold, coverage_threshold, hdl_seq,
                        fpout, fpout_tableinfo, fpout_stat)

    if outfile != "":
        myfunc.myclose(fpout)
    if outfile_tableinfo != "":
        myfunc.myclose(fpout_tableinfo)
    if outfile_stat != "":
        myfunc.myclose(fpout_stat)
Esempio n. 11
0
        try:
            subprocess.check_call(["mkdir", "-p", tmpdir])
        except subprocess.CalledProcessError, e:
            return 1

#     famid2seqidDict = myfunc.ReadFam2SeqidMap(mapfile)

    g_params['cdhit_wordsize'] = GetCDHitWordSize(g_params['nrlevel'])

    seqdbDict = None
    hdl_seqdb = None

    if g_params['isBigmem']:
        seqdbDict = ReadSeqDBDict(seqdb)
    else:
        hdl_seqdb = myfunc.MyDB(seqdb)
        if hdl_seqdb.failure:
            print >> sys.stderr, "Failed to load seqdb %s. exit" % (seqdb)
            return 1

    pfamidList = []

    extra_desp_dict = {}
    if extra_description_file != "":
        hdl_extra = myfunc.ReadLineByBlock(extra_description_file)
        if hdl_extra.failure:
            print >> sys.stderr, "Failed to read extra_description_file %s." % (
                extra_description_file)
            return 1
        lines = hdl_extra.readlines()
        while lines != None:
Esempio n. 12
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = ""
    pairListFile = ""
    seqlenFile = ""
    shortid2fullidFile = ""
    seqid2pfamidMapFile = ""
    pfamDefFile = '/data3/data/pfam/pfam27.0/Pfam-A.clans.tsv'
    topodb = ""
    seqdb = ""
    pdb2spFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            isNonOptionArg = False
            i += 1
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topodb", "--topodb"]:
                (topodb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pdb2sp", "-pdb2sp", "-pdbtosp", "--pdbtosp"]:
                (pdb2spFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqmsapath", "--seqmsapath"]:
                (g_params['seqmsapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-datapath", "--datapath"]:
                (g_params['datapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seq2pfam", "--seq2pfam"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfam2seq", "--pfam2seq"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-description", "--description"]:
                (g_params['description'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if not g_params['alignrange'] in ['all', 'full', 'part']:
                    print >> sys.stderr, "alignrange must be one of [all, full, part]"
                    return 1
                else:
                    if g_params['alignrange'] == 'full':
                        g_params['alignrange'] = 'FULL_ALIGNED'
                    elif g_params['alignrange'] == 'part':
                        g_params['alignrange'] = 'PART_ALIGNED'
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-treepath", "--treepath"]:
                (g_params['treepath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairalnpath", "--pairalnpath"]:
                (g_params['pairalnpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxperfamily", "--maxperfamily"]:
                (g_params['max_num_output_per_family'],
                 i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-shortid2fullid", "--shortid2fullid"]:
                (shortid2fullidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-debug", "--debug"]:
                if argv[i + 1][0].lower() == 'y':
                    g_params['isDEBUG'] = True
                else:
                    g_params['isDEBUG'] = False
                i += 2
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1

    if g_params['basename'] == "":
        print >> sys.stderr, "basename not set. exit"
        return 1
    if myfunc.checkfile(g_params['datapath'], "datapath") != 0:
        return 1
    if myfunc.checkfile(seqid2pfamidMapFile, "seqid2pfamidMapFile") != 0:
        return 1
    if myfunc.checkfile(pfamid2seqidMapFile, "pfamid2seqidMapFile") != 0:
        return 1

    if myfunc.checkfile(topodb + "0.db", "topodb") != 0:
        return 1
    if myfunc.checkfile(seqdb + "0.db", "seqdb") != 0:
        return 1
    if myfunc.checkfile(g_params['seqmsapath'], "seqmsapath") != 0:
        return 1

    if pdb2spFile != "":
        (g_params['pdb2uniprotMap'],
         g_params['uniprot2pdbMap']) = myfunc.ReadPDBTOSP(pdb2spFile)

    if g_params['datapath'] == "":
        print >> sys.stderr, "datapath not set"
        return 1
    elif not os.path.exists(g_params['datapath']):
        print >> sys.stderr, "datapath %s does not exist" % (
            g_params['datapath'])
        return 1

    if outpath == "":
        print >> sys.stderr, "outpath not set"
        return 1
    elif not os.path.exists(outpath):
        cmd = ["mkdir", "-p", outpath]
        subprocess.check_call(cmd)

    paircmpfile = "%s/%s.paircmp" % (g_params['datapath'],
                                     g_params['basename'])
    if myfunc.checkfile(paircmpfile, "paircmpfile") != 0:
        return 1

    (g_params['pfamidDefDict'],
     g_params['clanidDefDict']) = lcmp.ReadPfamDefFile(pfamDefFile)

    g_params['seqid2pfamidDict'] = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)
    g_params['pfamid2seqidDict'] = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tmpdir = tempfile.mkdtemp()
    if g_params['msapath'] == "":
        g_params['msapath'] = tmpdir
    if g_params['treepath'] == "":
        g_params['treepath'] = tmpdir
    if g_params['pairalnpath'] == "":
        g_params['pairalnpath'] = tmpdir

    pairCmpRecordList = []
    unprocessedBuffer = ""
    cntTotalReadInRecord = 0
    cntTotalOutputRecord = 0
    isEOFreached = False
    try:
        fpin = open(paircmpfile, "r")
    except IOError:
        print >> sys.stderr, "Failed to open input file %s" % (paircmpfile)
        return 1
    while 1:
        buff = fpin.read(myfunc.BLOCK_SIZE)
        if buff == "":
            isEOFreached = True
        buff = unprocessedBuffer + buff
        rdList = []
        unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(buff, rdList)
        rdList = FilterPairCmpResult(rdList)
        cntTotalReadInRecord += len(rdList)
        pairCmpRecordList += rdList
        if isEOFreached == True:
            break
    fpin.close()

    print "cntTotalReadInRecord =", cntTotalReadInRecord

    g_params['hdl_seqdb'] = myfunc.MyDB(seqdb)
    g_params['hdl_topodb'] = myfunc.MyDB(topodb)

    g_params['OS'] = os.uname()[0]
    if g_params['OS'].find('Linux') != -1:
        g_params['CP_EXE'] = "/bin/cp -uf"
    else:
        g_params['CP_EXE'] = "/bin/cp -f"

    if shortid2fullidFile != "":
        g_params['uniprotAC2FullSeqIDMap'] = myfunc.ReadID2IDMap(
            shortid2fullidFile)

    addname = ""
    if g_params['alignrange'] != 'all':
        addname += ".%s" % (g_params['alignrange'])

    dataTable = {}
    # structure of dataTable
    #    dataTable[pfamid] = {'set_seqid':set(), 'difftopopair':[{'INV':[(id1,id2)]},{'TM2GAP':},{}}

    # first read in pairCmpRecordList
    AddAllSeqInPairCmp(dataTable, pairCmpRecordList,
                       g_params['seqid2pfamidDict'])

    pairInfoFileList = []
    for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]:
        ss = "%s/%s_.cmpdup.FULL_ALIGNED.%s.pairinfo.txt" % (
            g_params['datapath'], g_params['basename'], cmpclass)
        pairInfoFileList.append(ss)
        pairinfoList = ReadPairInfo_cmpclass(ss)
        AddPairInfo(dataTable, pairinfoList, cmpclass)
#     print "\n".join(pairInfoFileList)
    if g_params['isDEBUG']:  #{{{
        for pfamid in dataTable:
            print pfamid
            print "\tset_seqid"
            print dataTable[pfamid]['set_seqid']
            print "\tdifftopopair"
            for cls in dataTable[pfamid]['difftopopair']:
                print "\t\t", cls
                for tup in dataTable[pfamid]['difftopopair'][cls]:
                    print "\t\t\t", tup  #}}}

    WriteHTML(dataTable, outpath)

    os.system("rm -rf %s" % (tmpdir))