Beispiel #1
0
def Ana_NumContUnmappedTM(infile):
    """Run the four CountContinuousUnmappedTM_* counters and plot the result.

    infile is read in chunks of BLOCK_SIZE and parsed with
    lcmp.ReadPairCmpResultFromBuffer.  Every batch of parsed records is
    passed to CountContinuousUnmappedTM_0 .. CountContinuousUnmappedTM_3,
    each once with isFilterNeighbour=True and once with
    isFilterNeighbour=False.  For every method/filter combination a table
    "tmp_ana_numContTM_method<N>_(non)filternb.txt" with the counts for the
    keys 1..20 is written to the directory of infile ("." if it has none)
    and the histogram plotting script is run on that table.

    Returns 1 if an IOError is raised, otherwise None.
    """
    methodList = [0, 1, 2, 3]
    outpath = os.path.dirname(infile)
    if outpath == "":
        outpath = "."
    try:
        # freqDict[2*method] is filled with isFilterNeighbour=True,
        # freqDict[2*method+1] with isFilterNeighbour=False.
        freqDict = {}
        for method in methodList:
            freqDict[2*method] = {}
            freqDict[2*method+1] = {}

        unprocessedBuffer = ""
        fpin = open(infile)
        try:
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                isEOFreached = (buff == "")
                # Text the parser could not consume yet is put in front of
                # the next chunk.
                buff = unprocessedBuffer + buff
                pairCmpRecordList = []
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                        buff, pairCmpRecordList)
                if len(pairCmpRecordList) > 0:
                    # Position in this list == method number.
                    counterList = [CountContinuousUnmappedTM_0,
                            CountContinuousUnmappedTM_1,
                            CountContinuousUnmappedTM_2,
                            CountContinuousUnmappedTM_3]
                    for method, counter in enumerate(counterList):
                        counter(pairCmpRecordList, freqDict[2*method],
                                isFilterNeighbour=True)
                        counter(pairCmpRecordList, freqDict[2*method+1],
                                isFilterNeighbour=False)
                if isEOFreached:
                    break
        finally:
            # Close the input even when parsing or counting raises.
            fpin.close()

        for method in methodList:
            for idx in [2*method, 2*method+1]:
                if idx == 2*method:
                    str_filter = "True"
                    suffix = "filternb"
                else:
                    str_filter = "False"
                    suffix = "nonfilternb"
                outfile = (outpath + os.sep +
                        "tmp_ana_numContTM_method%d_%s.txt" %
                        (method, suffix))
                fpout = open(outfile, "w")
                try:
                    # Blank line on stdout, kept from the original output.
                    print
                    fpout.write(
                            "#numTM count Method_%d isFilterNeighbour=%s\n" %
                            (method, str_filter))
                    for i in range(1, 21):
                        # Keys that were never counted are reported as 0.
                        fpout.write("%-5d %5d\n" %
                                (i, freqDict[idx].get(i, 0)))
                finally:
                    fpout.close()
                cmd = "/data3/wk/MPTopo/src/tmp_plot_histogram_logscale.sh %s"
                os.system(cmd%(outfile))
    except IOError:
        return 1
Beispiel #2
0
def main():  #{{{
    """Filter the records of a pairwise comparison file.

    Parses sys.argv, reads the input file in chunks of BLOCK_SIZE, filters
    every batch of parsed records with FilterPairCmpResult and writes the
    remaining records with lcmp.WritePairCmpRecord to the output file.
    Output goes to sys.stdout when no output file is given or when it cannot
    be opened for writing.  The numbers of records read and written are
    printed to stdout at the end.

    Returns 0 on success, 1 when called without arguments and -1 on a
    command-line or input-file error.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    parameters = {}
    parameters['minGapFraction'] = 0.5
    parameters['maxGapFraction'] = 1.0
    parameters['minDGvalue'] = -999999.0
    parameters['maxDGvalue'] = 1.0

    infile = ""
    outfile = ""
    isQuiet = False

    outfileOptList = ["-o", "--o", "-outfile", "--outfile"]
    # Options taking a float value -> key they set in parameters.
    floatOptDict = {
        "-gap": 'minGapFraction',
        "--gap": 'minGapFraction',
        "-dg": 'maxDGvalue',
        "--dg": 'maxDGvalue',
    }

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg == True:
            # The argument right after "--" is taken as the input file even
            # if it starts with "-".
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                sys.exit()
            elif arg == "-q":
                isQuiet = True
                i += 1
            elif arg in outfileOptList or arg in floatOptDict:
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires an argument.\n" % arg)
                    return -1
                value = sys.argv[i + 1]
                if arg in outfileOptList:
                    outfile = value
                else:
                    parameters[floatOptDict[arg]] = float(value)
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return -1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("infile not set. Exit.\n")
        return -1
    elif not os.path.exists(infile):
        sys.stderr.write("infile %s does not exists. Exit.\n" % infile)
        return -1

    # open() raises IOError on failure; it never returns a false value.
    try:
        fpin = open(infile, "rb")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (infile))
        return -1

    fpout = sys.stdout
    if outfile != "":
        try:
            fpout = open(outfile, "w")
        except IOError:
            sys.stderr.write("Failed to write to file %s.\n" % outfile)
            sys.stderr.write("Reset output to sys.stdout.\n")
            fpout = sys.stdout

    unprocessedBuffer = ""
    cntTotalOutputRecord = 0
    cntTotalReadInRecord = 0
    try:
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            isEOFreached = (buff == "")
            # Text the parser could not consume yet is put in front of the
            # next chunk.
            buff = unprocessedBuffer + buff
            pairCmpRecordList = []
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                buff, pairCmpRecordList)
            if len(pairCmpRecordList) > 0:
                filteredList = FilterPairCmpResult(pairCmpRecordList,
                                                   parameters)
                (status, cntTotalOutputRecord) = lcmp.WritePairCmpRecord(
                    filteredList, cntTotalOutputRecord, fpout)
                cntTotalReadInRecord += len(pairCmpRecordList)
            if isEOFreached:
                break
    finally:
        # Release both files even when parsing or filtering raises.
        fpin.close()
        if fpout is not sys.stdout:
            fpout.close()

    sys.stdout.write("cntTotalReadInRecord = %s\n" % cntTotalReadInRecord)
    sys.stdout.write("cntTotalOutputRecord = %s\n" % cntTotalOutputRecord)

    return 0
def main(g_params):  #{{{
    """Annotate and filter the records of a pairwise comparison file.

    Parses sys.argv, storing most settings in g_params, then reads the input
    file in chunks of BLOCK_SIZE.  Every batch of parsed records is passed
    through AddTableInfo, AddSignalPInfo and AddDupInfo, filtered with
    FilterPairCmpResult and written with lcmp.WritePairCmpRecord to the
    stream returned by myfunc.myopen for the -o option.  The numbers of
    records read and written are printed to stdout at the end.

    g_params keys that may be set here: isRestrictIDListSet,
    isRemoveSignalP, isRemoveDup, seqidttype, minSeqIDT, maxSeqIDT,
    isEvodist, alignrange, isDEBUG, isSignalPSet, isDupSet.
    g_params['seqidttype'] must exist once the arguments are parsed.

    Returns 0 on success, 1 when called without arguments or with an invalid
    -alignrange value, and -1 on any other command-line or file error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    infile = ""
    isQuiet = False
    tableinfoFile = ""
    cmpclassList = []
    restrictIDListFile = ""

    signalpFile = ""
    dupFile = ""
    outfile = ""
    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            # The argument right after "--" is taken as the input file even
            # if it starts with "-".
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                sys.exit()
            elif argv[i] in ["-o", "--o"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-cmpclass", "--cmpclass"]:
                # May be given several times; every value is collected.
                (tmpstr, i) = myfunc.my_getopt_str(argv, i)
                cmpclassList.append(tmpstr)
            elif argv[i] in ["-signalp", "--signalp"]:
                (signalpFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-restrictidlist", "--restrictidlist"]:
                (restrictIDListFile, i) = myfunc.my_getopt_str(argv, i)
                g_params['isRestrictIDListSet'] = True
            elif argv[i] in ["-dup", "--dup", "-dupfile", "--dupfile"]:
                (dupFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-rmsp", "--rmsp"]:
                g_params['isRemoveSignalP'] = True
                i += 1
            elif argv[i] in ["-rmdup", "--rmdup"]:
                g_params['isRemoveDup'] = True
                i += 1
            elif argv[i] in ["-seq2fammap", "--seq2fammap"]:
                # Parsed so that the value is consumed; not used below.
                (seq2famMapfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqidttype", "--seqidttype"]:
                g_params['seqidttype'], i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-tableinfo", "--tableinfo"]:
                tableinfoFile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-evodist", "--evodist"]:
                g_params['isEvodist'] = True
                i += 1
            elif argv[i] in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if g_params['alignrange'] not in ['all', 'full', 'part']:
                    sys.stderr.write(
                        "alignrange must be one of [all, full, part]\n")
                    return 1
                elif g_params['alignrange'] == 'full':
                    g_params['alignrange'] = 'FULL_ALIGNED'
                elif g_params['alignrange'] == 'part':
                    g_params['alignrange'] = 'PART_ALIGNED'
            elif argv[i] in ["-debug", "--debug"]:
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires an argument.\n" % argv[i])
                    return -1
                # Any value starting with "y"/"Y" switches debugging on.
                g_params['isDEBUG'] = argv[i + 1].lower().startswith('y')
                i += 2
            elif argv[i] in [
                    "-debug-unmapped-position", "--debug-unmapped-position"
            ]:
                # NOTE(review): this only binds a local name, so the option
                # has no visible effect in this function, and the following
                # argument is skipped as well.  Kept unchanged; confirm
                # whether a module-level flag was intended.
                DEBUG_UNMAPPED_TM_POSITION = 1
                i += 2
            elif argv[i] == "-q":
                isQuiet = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % argv[i])
                return -1
        else:
            infile = argv[i]
            i += 1

    if infile == "":
        sys.stderr.write("infile not set. Exit.\n")
        return -1
    elif not os.path.exists(infile):
        sys.stderr.write("infile %s does not exists. Exit.\n" % infile)
        return -1

    pairalnStat = {}
    if g_params['seqidttype'] != 0:
        if tableinfoFile == "" or not os.path.exists(tableinfoFile):
            sys.stderr.write(
                "tableinfoFile must be set when seqidttype is set to 1 or 2\n")
            sys.stderr.write("but seqidttype = %d is set. Exit.\n" %
                             g_params['seqidttype'])
            return -1
        pairalnStat = lcmp.ReadPairAlnTableInfo(tableinfoFile)

    signalpDict = {}
    if signalpFile != "":
        signalpDict = lcmp.ReadSignalPDict(signalpFile)
    if signalpDict != {}:
        g_params['isSignalPSet'] = True

    dupPairList = []
    if dupFile != "":
        dupPairList = lcmp.ReadDupPairList(dupFile)
    if len(dupPairList) > 0:
        g_params['isDupSet'] = True
    dupPairSet = set(dupPairList)

    restrictIDSet = set([])
    if restrictIDListFile != "":
        restrictIDSet = set(myfunc.ReadIDList(restrictIDListFile))

    # The input is opened only after all auxiliary files have been read, so
    # no early return above can leave the handle open.  It is still opened
    # before the output, so a failure here does not create the output file.
    try:
        fpin = open(infile, "rb")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (infile))
        return -1

    rltyDict = {}
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    unprocessedBuffer = ""
    cntTotalReadInRecord = 0
    cntTotalOutputRecord = 0
    try:
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            isEOFreached = (buff == "")
            # Text the parser could not consume yet is put in front of the
            # next chunk.
            buff = unprocessedBuffer + buff
            pairCmpRecordList = []
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                buff, pairCmpRecordList)

            AddTableInfo(pairCmpRecordList, pairalnStat)
            AddSignalPInfo(pairCmpRecordList, signalpDict)
            AddDupInfo(pairCmpRecordList, dupPairSet)

            cntTotalReadInRecord += len(pairCmpRecordList)
            pairCmpRecordList = FilterPairCmpResult(pairCmpRecordList,
                                                    cmpclassList, rltyDict,
                                                    restrictIDSet)

            if len(pairCmpRecordList) > 0:
                lcmp.WritePairCmpRecord(pairCmpRecordList,
                                        cntTotalOutputRecord, fpout)
                cntTotalOutputRecord += len(pairCmpRecordList)
            if isEOFreached:
                break
    finally:
        # Release both streams even when annotation or filtering raises.
        fpin.close()
        myfunc.myclose(fpout)

    sys.stdout.write("cntTotalReadInRecord = %s\n" % cntTotalReadInRecord)
    sys.stdout.write("cntTotalOutputRecord = %s\n" % cntTotalOutputRecord)
    return 0
Beispiel #4
0
def Ana_NumTMHeatMap(
        infile,
        seqid2pfamidDict,
        seqid2clanidDict,  #{{{
        tm_pfamidSet,
        tm_clanidSet,
        pfamidDefDict,
        clanidDefDict,
        signalpDict,
        classList_TableNumTMHeatMap,
        SPE_PAIR_LIST,
        pfamid2seqidDict,
        clanid2seqidDict,
        idSet_TMpro,
        usedPfamIDSet,
        alignrange):

    dataTableNumTMHeatMap = {}
    InitTableNumTMHeatMap(dataTableNumTMHeatMap, classList_TableNumTMHeatMap,
                          100, SPE_PAIR_LIST)
    pairInfoListDict = {}
    for cls in classList_TableNumTMHeatMap:
        pairInfoListDict[cls] = []

    if g_params['outpath'] != "":
        outpath = g_params['outpath']
    else:
        outpath = os.path.dirname(infile)
        if outpath == "":
            outpath = "."

    try:
        unprocessedBuffer = ""
        cntTotalReadInRecord = 0
        cntTotalOutputRecord = 0
        isEOFreached = False
        fpin = open(infile)
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            if buff == "":
                isEOFreached = True
            buff = unprocessedBuffer + buff
            pairCmpRecordList = []
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                buff, pairCmpRecordList)
            if len(pairCmpRecordList) > 0:
                #CountSpecialPair(pairCmpRecordList, pairInfoLists, SPE_PAIR_LIST)
                AnaPairCmpResultNumTMHeatMap(pairCmpRecordList,
                                             dataTableNumTMHeatMap,
                                             pairInfoListDict,
                                             classList_TableNumTMHeatMap,
                                             signalpDict, SPE_PAIR_LIST,
                                             alignrange)
                cntTotalReadInRecord += len(pairCmpRecordList)
                print "cntTotalReadInRecord = ", cntTotalReadInRecord
            if isEOFreached == True:
                break
        fpin.close()

        #print "count 6,12 = ", dataTableNumTMHeatMap['RMSP']['data'][6][12]
        #print "count = ", dataTableNumTMHeatMap['RMSP']['data']
        #print "count special", dataTableNumTMHeatMap['RMSP']['pairInfoLists']
        try:
            for i in xrange(len(
                    dataTableNumTMHeatMap['RMSP']['pairInfoLists'])):
                print SPE_PAIR_LIST[i], len(
                    dataTableNumTMHeatMap['RMSP']['pairInfoLists'][i])
        except KeyError:
            pass

#         if g_params['numTMHeatMapMode'] == "full":
#             FillSymmetricDataTableNumTMHeatMap(dataTableNumTMHeatMap,
#                     classList_TableNumTMHeatMap)

        for cls in classList_TableNumTMHeatMap:  # ["ALL", "RMSP"]
            (freqListPfam, freqListClan) = AnaFamFrequency_onelist(
                pairInfoListDict[cls], seqid2pfamidDict, seqid2clanidDict,
                pfamid2seqidDict, clanid2seqidDict, tm_pfamidSet, tm_clanidSet,
                idSet_TMpro, usedPfamIDSet)
            #print "tm_clanidSet", tm_clanidSet
            if g_params['pairwise_comparison_method'] == 1:
                cmpclassList = cmpClassList_method1
            elif g_params['pairwise_comparison_method'] == 3:
                cmpclassList = cmpClassList_method3
            isCmpDup = False
            outFileFamPairCount = "%s%s%s.%s.%s.pfam.paircount.txt" % (
                outpath, os.sep, g_params['outname'], alignrange, cls)
            WriteFamPairCount(freqListPfam, pairInfoListDict[cls],
                              pfamidDefDict, cmpclassList,
                              g_params['pairwise_comparison_method'], isCmpDup,
                              outFileFamPairCount)
            outFileFamPairCount = "%s%s%s.%s.%s.clan.paircount.txt" % (
                outpath, os.sep, g_params['outname'], alignrange, cls)
            WriteFamPairCount(freqListClan, pairInfoListDict[cls],
                              clanidDefDict, cmpclassList,
                              g_params['pairwise_comparison_method'], isCmpDup,
                              outFileFamPairCount)

            if g_params['pairwise_comparison_method'] == 3:
                # if mp=3, write another statistics with cmpdup
                isCmpDup = True
                cmpclassList = cmpClassList_mp3_cmpdup
                outFileFamPairCount = "%s%s%s.%s.%s.cmpdup.pfam.paircount.txt" % (
                    outpath, os.sep, g_params['outname'], alignrange, cls)
                WriteFamPairCount(freqListPfam, pairInfoListDict[cls],
                                  pfamidDefDict, cmpclassList,
                                  g_params['pairwise_comparison_method'],
                                  isCmpDup, outFileFamPairCount)
                outFileFamPairCount = "%s%s%s.%s.%s.cmpdup.clan.paircount.txt" % (
                    outpath, os.sep, g_params['outname'], alignrange, cls)
                WriteFamPairCount(freqListClan, pairInfoListDict[cls],
                                  clanidDefDict, cmpclassList,
                                  g_params['pairwise_comparison_method'],
                                  isCmpDup, outFileFamPairCount)

            for mode_norm in ["norm_diag", "no_norm"]:
                if mode_norm in ["norm_diag", "no_norm"]:
                    heatmapmode = 'half'
                else:
                    heatmapmode = 'full'
                outFileNumTMHeatMap = "%s%s%s.%s.%s.%s.%s.txt" % (
                    outpath, os.sep, g_params['outname'], alignrange,
                    heatmapmode, cls, mode_norm)
                if heatmapmode == 'full':
                    mtx = myfunc.FillSymmetricMatrix(
                        dataTableNumTMHeatMap[cls]['data'],
                        dataTableNumTMHeatMap[cls]['maxNumTM'])
                else:
                    mtx = dataTableNumTMHeatMap[cls]['data']

                if mode_norm == "no_norm":
                    for i in range(dataTableNumTMHeatMap[cls]['maxNumTM']):
                        mtx[i][i] = 0

                if WriteNumTMHeatMap(mtx,
                                     dataTableNumTMHeatMap[cls]['maxNumTM'],
                                     dataTableNumTMHeatMap[cls]['numPair'],
                                     mode_norm, outFileNumTMHeatMap) == 0:
                    print "heatmap %s output" % (outFileNumTMHeatMap)
                    cmd = "%s/plotNumTMHeatMap.sh %s" % (binpath,
                                                         outFileNumTMHeatMap)
                    os.system(cmd)
                outFileSpecialPairAna = "%s%s%s.%s.%s.%s.%s.specialpairana.txt" % (
                    outpath, os.sep, g_params['outname'], alignrange,
                    g_params['numTMHeatMapMode'], cls, mode_norm)
                WriteSpecialPair(dataTableNumTMHeatMap[cls],
                                 pairInfoListDict[cls], seqid2pfamidDict,
                                 seqid2clanidDict, tm_pfamidSet, tm_clanidSet,
                                 pfamidDefDict, clanidDefDict, SPE_PAIR_LIST,
                                 outFileSpecialPairAna)
                print "Anafile %s output" % (outFileSpecialPairAna)

#         for i in xrange(len(SPE_PAIR_LIST)):
#             print
#             pair = SPE_PAIR_LIST[i]
#             print pair
#             print len(pairInfoLists[i])
#             print pairInfoLists[i]

    except IOError:
        return 1
def main():#{{{
    """Write the topology alignments of pairs with internal variation.

    Parses sys.argv, reads the pairwise comparison file in chunks of
    BLOCK_SIZE and filters every batch of parsed records with
    FilterPairCmpResult.  For each remaining record accepted by
    IsHasInternalVariation, the alignment stored under "<id1>-<id2>" in the
    dictionary returned by GetPairTopoAln (option -aln) is written to the
    output as two annotation/sequence entries; with -write-paircmp the
    record itself is also written to that file with
    lcmp.WritePairCmpRecord.  Output goes to sys.stdout when no output file
    is given or when it cannot be opened for writing.

    Returns 0 on success, 1 when called without arguments and -1 on a
    command-line or input-file error.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    parameters = {}
    parameters['minGapFraction'] = 0.5
    parameters['maxGapFraction'] = 1.0
    parameters['minDGvalue'] = -999999.0
    parameters['maxDGvalue'] = 1.0
    parameters['minSeqIDT'] = 0.0
    parameters['maxSeqIDT'] = 100.0

    infile = ""
    outfile = ""
    outPaircmpfile = ""
    pairalnTopoFile = ""
    isQuiet = False

    outfileOptList = ["-o", "--o", "-outfile", "--outfile"]
    paircmpOptList = ["-write-paircmp", "--write-paircmp"]
    alnOptList = ["-aln", "--aln"]
    # Options taking a float value -> key they set in parameters.
    floatOptDict = {
        "-gap": 'minGapFraction',
        "--gap": 'minGapFraction',
        "-dg": 'maxDGvalue',
        "--dg": 'maxDGvalue',
        "-min-seqidt": 'minSeqIDT',
        "--min-seqidt": 'minSeqIDT',
        "-max-seqidt": 'maxSeqIDT',
        "--max-seqidt": 'maxSeqIDT',
    }

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg == True:
            # The argument right after "--" is taken as the input file even
            # if it starts with "-".
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                sys.exit()
            elif arg == "-q":
                isQuiet = True
                i += 1
            elif (arg in floatOptDict or arg in outfileOptList
                    or arg in paircmpOptList or arg in alnOptList):
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires an argument.\n" % arg)
                    return -1
                value = sys.argv[i+1]
                if arg in floatOptDict:
                    parameters[floatOptDict[arg]] = float(value)
                elif arg in outfileOptList:
                    outfile = value
                elif arg in paircmpOptList:
                    outPaircmpfile = value
                else:
                    pairalnTopoFile = value
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return -1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("infile not set. Exit.\n")
        return -1
    elif not os.path.exists(infile):
        sys.stderr.write("infile %s does not exists. Exit.\n" % infile)
        return -1

    if pairalnTopoFile == "":
        sys.stderr.write("pairalnTopoFile not set. Exit.\n")
        return -1

    pairTopoAlnDict = GetPairTopoAln(pairalnTopoFile)
    # pairTopoAlnDict[id1-id2]['id1] ['id2'] ['anno1'] ['anno2] ['seq1']
    # ['seq2']

    # open() raises IOError on failure; it never returns a false value.
    try:
        fpin = open(infile, "rb")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (infile))
        return -1

    fpout = sys.stdout
    fppaircmp = None
    cntTotalOutputRecord = 0
    cntTotalReadInRecord = 0
    try:
        if outPaircmpfile != "":
            fppaircmp = open(outPaircmpfile, "w")

        if outfile != "":
            try:
                fpout = open(outfile, "w")
            except IOError:
                sys.stderr.write("Failed to write to file %s.\n" % outfile)
                sys.stderr.write("Reset output to sys.stdout.\n")
                fpout = sys.stdout

        unprocessedBuffer = ""
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            isEOFreached = (buff == "")
            # Text the parser could not consume yet is put in front of the
            # next chunk.
            buff = unprocessedBuffer + buff
            pairCmpRecordList = []
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    buff, pairCmpRecordList)
            if len(pairCmpRecordList) > 0:
                filteredList = FilterPairCmpResult(pairCmpRecordList,
                        parameters)
                for record in filteredList:
                    if not IsHasInternalVariation(record):
                        continue
                    if fppaircmp is not None:
                        (status, cntTotalOutputRecord) = lcmp.WritePairCmpRecord(
                                [record], cntTotalOutputRecord, fppaircmp)
                    key = "%s-%s"%(record['id1'], record['id2'])
                    pair = pairTopoAlnDict[key]
                    fpout.write(">%s\n"%pair['anno1'])
                    fpout.write("%s\n"%pair['seq1'])
                    fpout.write(">%s\n"%pair['anno2'])
                    fpout.write("%s\n"%pair['seq2'])
                cntTotalReadInRecord += len(pairCmpRecordList)
            if isEOFreached:
                break
    finally:
        # Release every file that was opened, also when an error occurs.
        fpin.close()
        if fpout is not sys.stdout:
            fpout.close()
        if fppaircmp is not None:
            fppaircmp.close()

    sys.stdout.write("cntTotalReadInRecord = %s\n" % cntTotalReadInRecord)
    sys.stdout.write("cntTotalOutputRecord = %s\n" % cntTotalOutputRecord)

    return 0
def main(g_params):  #{{{
    """Run StatDupPaircmp over all records of a pairwise comparison file.

    Parses sys.argv, builds a set from ReadDupPairList(dupfile), then reads
    the file given with -paircmp in chunks of BLOCK_SIZE and passes every
    batch of parsed records, the set and the output stream to
    StatDupPaircmp.  The output stream is obtained from myfunc.myopen for
    the -o option.  -q sets g_params['isQuiet'].

    Returns 0 on success and 1 on any command-line or input-file error
    (also after printing the help text).
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    dupfile = ""
    paircmpfile = ""
    outfile = ""

    outpathOptList = ["-outpath", "--outpath"]
    outfileOptList = ["-o", "--o"]
    idListOptList = ["-l", "--l"]
    dupOptList = ["-dup", "--dup"]
    paircmpOptList = ["-paircmp", "--paircmp"]
    valueOptList = (outpathOptList + outfileOptList + idListOptList +
                    dupOptList + paircmpOptList)

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg == True:
            # NOTE(review): the argument following "--" is skipped without
            # being stored anywhere; kept as in the original - confirm.
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg == "-q":
                g_params['isQuiet'] = True
                i += 1
            elif arg in valueOptList:
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires an argument.\n" % arg)
                    return 1
                value = argv[i + 1]
                if arg in outpathOptList:
                    # Accepted, but not used further in this function.
                    outpath = value
                elif arg in outfileOptList:
                    outfile = value
                elif arg in idListOptList:
                    # Accepted, but not used further in this function.
                    idListFile = value
                elif arg in dupOptList:
                    dupfile = value
                else:
                    paircmpfile = value
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            # Positional arguments are not accepted.
            sys.stderr.write("Error! Wrong argument: %s\n" % arg)
            return 1

    # Say why the program stops instead of exiting silently.
    if paircmpfile == "":
        sys.stderr.write("paircmpfile not set. Exit.\n")
        return 1
    if dupfile == "":
        sys.stderr.write("dupfile not set. Exit.\n")
        return 1

    dupPairList = ReadDupPairList(dupfile)
    dupPairSet = set(dupPairList)

    # Open the input before the output so that an unreadable input does not
    # leave an empty output file behind.
    try:
        fpin = open(paircmpfile, "r")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (paircmpfile))
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    unprocessedBuffer = ""
    try:
        while 1:
            buff = fpin.read(BLOCK_SIZE)
            isEOFreached = (buff == "")
            # Text the parser could not consume yet is put in front of the
            # next chunk.
            buff = unprocessedBuffer + buff
            pairCmpRecordList = []
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                buff, pairCmpRecordList)

            if len(pairCmpRecordList) > 0:
                StatDupPaircmp(pairCmpRecordList, dupPairSet, fpout)
            if isEOFreached:
                break
    finally:
        # Release both streams even when processing raises.
        fpin.close()
        myfunc.myclose(fpout)
    return 0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1
    i = 1
    isNonOptionArg = False
    isPickOne = False
    paircmpFile = ""
    pfamACDEListFile = '/data3/data/pfam/pfamA.seed.ac-delist'
    seqDefFile = '/data3/wk/MPTopo/pfamAna/pfam2-selTM-giid-refseqid-pfamid-description.txt'
    outpath = ""
    htmlname = 'index'
    tableinfoFile = ""
    while i < numArgv:  #{{{
        if isNonOptionArg == True:
            paircmpFile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] == "-h" or argv[i] == "--help":
                PrintHelp()
                return 1
            elif argv[i] == "-outpath" or argv[i] == "--outpath":
                outpath = argv[i + 1]
                i += 2
            elif argv[i] == "-htmlname" or argv[i] == "--htmlname":
                htmlname = argv[i + 1]
                i += 2
            elif argv[i] == "-alnfigpath" or argv[i] == "--alnfigpath":
                g_params['alnFigPath'] = argv[i + 1]
                i += 2
            elif argv[i] == "-pfamdef" or argv[i] == "--pfamdef":
                pfamACDEListFile = argv[i + 1]
                i += 2
            elif argv[i] == "-msapath" or argv[i] == "--msapath":
                g_params['MSAPath'] = argv[i + 1]
                i += 2
            elif argv[i] == "-msapath2" or argv[i] == "--msapath2":
                g_params['MSAPath2'] = argv[i + 1]
                i += 2
            elif argv[i] == "-tableinfo" or argv[i] == "--tableinfo":
                tableinfoFile = argv[i + 1]
                i += 2
            elif argv[i] == "-seqdef" or argv[i] == "--seqdef":
                seqDefFile = argv[i + 1]
                i += 2
            elif argv[i] == "-gap" or argv[i] == "--gap":
                g_params['minGapFraction'] = float(argv[i + 1])
                i += 2
            elif argv[i] == "-dg" or argv[i] == "--dg":
                g_params['maxDGvalue'] = float(argv[i + 1])
                i += 2
            elif argv[i] in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'] = float(argv[i + 1])
                i += 2
            elif argv[i] in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'] = float(argv[i + 1])
                i += 2
            elif argv[i] in ["-tableformat", "--tableformat"]:
                g_params['htmltableformat'] = int(argv[i + 1])
                i += 2
            elif argv[i] in ["-treepath", "--treepath"]:
                g_params['treepath'] = argv[i + 1]
                i += 2
            elif argv[i] in ["-ordermsapath", "--ordermsapath"]:
                g_params['ordermsapath'] = argv[i + 1]
                i += 2
            elif argv[i] in ["-topomsapath", "--topomsapath"]:
                g_params['topomsapath'] = argv[i + 1]
                i += 2
            elif argv[i] in ["-type", "--type"]:
                g_params['selecttype'] = argv[i + 1]
                i += 2
            elif argv[i] in ["-filter-predseq", "--filter-predseq"]:
                if argv[i + 1].lower()[0] == 'y':
                    g_params['isFilterPredictedSeq'] = True
                else:
                    g_params['isFilterPredictedSeq'] = False
                i += 2
            elif argv[i] == "-q":
                isQuiet = True
                i += 1
            elif argv[i] in ["-pickone", "--pickone"]:
                isPickOne = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            paircmpFile = argv[i]
            i += 1
#}}}
    g_params['outpath'] = outpath
    if not os.path.exists(outpath):
        os.system("mkdir -p %s" % outpath)

# read paircmprecordlist
    fpin = open(paircmpFile, 'r')
    buff = fpin.read()
    fpin.close()
    recordList = []
    unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(buff, recordList)
    print "len(recordList) =", len(recordList)

    seqIDListSet = set([])
    for record in recordList:
        seqIDListSet.add(record['id1'])
        seqIDListSet.add(record['id2'])
    print "len(seqIDListSet) =", len(seqIDListSet)

    # Read In pairwise alignment info
    pairalnStat = {}
    if tableinfoFile != "" and os.path.exists(tableinfoFile):
        pairalnStat = ReadInTableInfo(tableinfoFile)
        print "len(pairalnStat) =", len(pairalnStat)

#Read In pfamDefList
    if not os.path.exists(pfamACDEListFile):
        print >> sys.stderr, "Error! file pfamACDEListFile (%s) does not exist." % pfamACDEListFile
        return 1
    pfamDefDict = ReadPfamDEList(pfamACDEListFile)
    print 'len(pfamDefDict)=', len(pfamDefDict)

    #Read in seqinfoList
    seqInfoDict = {}
    if not os.path.exists(seqDefFile):
        print >> sys.stderr, "Error! file seqDefFile (%s) does not exist." % seqDefFile
        return 1
    fpin = open(seqDefFile, "r")
    line = fpin.readline()
    line = fpin.readline()
    while line:
        strs = line.split('|')
        if len(strs) == 4:
            gid = strs[0].strip()
            if gid in seqIDListSet:
                refseqid = strs[1].strip()
                pfamid = strs[2].strip()
                seqdef = strs[3].strip()
                seqInfoDict[gid] = {}
                seqInfoDict[gid]['pfamid'] = pfamid
                seqInfoDict[gid]['refseqid'] = refseqid
                seqInfoDict[gid]['seqdef'] = seqdef
                seqInfoDict[gid]['pfamdef'] = pfamDefDict[pfamid]
        line = fpin.readline()
    fpin.close()

    print 'len(seqInfoDict)=', len(seqInfoDict)

    # add tableinfo to record list
    print "Add pairwise alignment table info to record..."
    AddTableInfo(recordList, pairalnStat)
    print "Add seqdef to record ..."
    AddSeqDefInfo(recordList, seqInfoDict)

    filteredRecordList = FilterPairCmpResult(recordList)
    del recordList

    numFilteredRecordList = len(filteredRecordList)
    print "numFilteredRecordList = %d" % numFilteredRecordList

    # reorder list according to pfamid
    numPair = len(filteredRecordList)
    tupList = []  # list of (index - pfamid)
    for i in xrange(numPair):
        thisPfamid = seqInfoDict[filteredRecordList[i]['id1']]['pfamid']
        tupList.append((i, thisPfamid))
    sorted_by_pfamid = sorted(tupList, key=lambda tup: tup[1])

    pairCmpRecordList = []
    for i in xrange(numPair):
        pairCmpRecordList.append(filteredRecordList[sorted_by_pfamid[i][0]])

    if isPickOne:
        pairCmpRecordList = PickOnlyOneForEachPfam(pairCmpRecordList,
                                                   sorted_by_pfamid)

    print "len(pairCmpRecordList) = ", len(pairCmpRecordList)

    WriteHTML(pairCmpRecordList, seqInfoDict, htmlname, outpath)
    return 0
Beispiel #8
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = ""
    pairListFile = ""
    seqlenFile = ""
    shortid2fullidFile = ""
    seqid2pfamidMapFile = ""
    pfamDefFile = '/data3/data/pfam/pfam27.0/Pfam-A.clans.tsv'
    topodb = ""
    seqdb = ""
    pdb2spFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            isNonOptionArg = False
            i += 1
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topodb", "--topodb"]:
                (topodb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pdb2sp", "-pdb2sp", "-pdbtosp", "--pdbtosp"]:
                (pdb2spFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqmsapath", "--seqmsapath"]:
                (g_params['seqmsapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-datapath", "--datapath"]:
                (g_params['datapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seq2pfam", "--seq2pfam"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfam2seq", "--pfam2seq"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-description", "--description"]:
                (g_params['description'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if not g_params['alignrange'] in ['all', 'full', 'part']:
                    print >> sys.stderr, "alignrange must be one of [all, full, part]"
                    return 1
                else:
                    if g_params['alignrange'] == 'full':
                        g_params['alignrange'] = 'FULL_ALIGNED'
                    elif g_params['alignrange'] == 'part':
                        g_params['alignrange'] = 'PART_ALIGNED'
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-treepath", "--treepath"]:
                (g_params['treepath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairalnpath", "--pairalnpath"]:
                (g_params['pairalnpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxperfamily", "--maxperfamily"]:
                (g_params['max_num_output_per_family'],
                 i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-shortid2fullid", "--shortid2fullid"]:
                (shortid2fullidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-debug", "--debug"]:
                if argv[i + 1][0].lower() == 'y':
                    g_params['isDEBUG'] = True
                else:
                    g_params['isDEBUG'] = False
                i += 2
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1

    if g_params['basename'] == "":
        print >> sys.stderr, "basename not set. exit"
        return 1
    if myfunc.checkfile(g_params['datapath'], "datapath") != 0:
        return 1
    if myfunc.checkfile(seqid2pfamidMapFile, "seqid2pfamidMapFile") != 0:
        return 1
    if myfunc.checkfile(pfamid2seqidMapFile, "pfamid2seqidMapFile") != 0:
        return 1

    if myfunc.checkfile(topodb + "0.db", "topodb") != 0:
        return 1
    if myfunc.checkfile(seqdb + "0.db", "seqdb") != 0:
        return 1
    if myfunc.checkfile(g_params['seqmsapath'], "seqmsapath") != 0:
        return 1

    if pdb2spFile != "":
        (g_params['pdb2uniprotMap'],
         g_params['uniprot2pdbMap']) = myfunc.ReadPDBTOSP(pdb2spFile)

    if g_params['datapath'] == "":
        print >> sys.stderr, "datapath not set"
        return 1
    elif not os.path.exists(g_params['datapath']):
        print >> sys.stderr, "datapath %s does not exist" % (
            g_params['datapath'])
        return 1

    if outpath == "":
        print >> sys.stderr, "outpath not set"
        return 1
    elif not os.path.exists(outpath):
        cmd = ["mkdir", "-p", outpath]
        subprocess.check_call(cmd)

    paircmpfile = "%s/%s.paircmp" % (g_params['datapath'],
                                     g_params['basename'])
    if myfunc.checkfile(paircmpfile, "paircmpfile") != 0:
        return 1

    (g_params['pfamidDefDict'],
     g_params['clanidDefDict']) = lcmp.ReadPfamDefFile(pfamDefFile)

    g_params['seqid2pfamidDict'] = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)
    g_params['pfamid2seqidDict'] = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tmpdir = tempfile.mkdtemp()
    if g_params['msapath'] == "":
        g_params['msapath'] = tmpdir
    if g_params['treepath'] == "":
        g_params['treepath'] = tmpdir
    if g_params['pairalnpath'] == "":
        g_params['pairalnpath'] = tmpdir

    pairCmpRecordList = []
    unprocessedBuffer = ""
    cntTotalReadInRecord = 0
    cntTotalOutputRecord = 0
    isEOFreached = False
    try:
        fpin = open(paircmpfile, "r")
    except IOError:
        print >> sys.stderr, "Failed to open input file %s" % (paircmpfile)
        return 1
    while 1:
        buff = fpin.read(myfunc.BLOCK_SIZE)
        if buff == "":
            isEOFreached = True
        buff = unprocessedBuffer + buff
        rdList = []
        unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(buff, rdList)
        rdList = FilterPairCmpResult(rdList)
        cntTotalReadInRecord += len(rdList)
        pairCmpRecordList += rdList
        if isEOFreached == True:
            break
    fpin.close()

    print "cntTotalReadInRecord =", cntTotalReadInRecord

    g_params['hdl_seqdb'] = myfunc.MyDB(seqdb)
    g_params['hdl_topodb'] = myfunc.MyDB(topodb)

    g_params['OS'] = os.uname()[0]
    if g_params['OS'].find('Linux') != -1:
        g_params['CP_EXE'] = "/bin/cp -uf"
    else:
        g_params['CP_EXE'] = "/bin/cp -f"

    if shortid2fullidFile != "":
        g_params['uniprotAC2FullSeqIDMap'] = myfunc.ReadID2IDMap(
            shortid2fullidFile)

    addname = ""
    if g_params['alignrange'] != 'all':
        addname += ".%s" % (g_params['alignrange'])

    dataTable = {}
    # structure of dataTable
    #    dataTable[pfamid] = {'set_seqid':set(), 'difftopopair':[{'INV':[(id1,id2)]},{'TM2GAP':},{}}

    # first read in pairCmpRecordList
    AddAllSeqInPairCmp(dataTable, pairCmpRecordList,
                       g_params['seqid2pfamidDict'])

    pairInfoFileList = []
    for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]:
        ss = "%s/%s_.cmpdup.FULL_ALIGNED.%s.pairinfo.txt" % (
            g_params['datapath'], g_params['basename'], cmpclass)
        pairInfoFileList.append(ss)
        pairinfoList = ReadPairInfo_cmpclass(ss)
        AddPairInfo(dataTable, pairinfoList, cmpclass)
#     print "\n".join(pairInfoFileList)
    if g_params['isDEBUG']:  #{{{
        for pfamid in dataTable:
            print pfamid
            print "\tset_seqid"
            print dataTable[pfamid]['set_seqid']
            print "\tdifftopopair"
            for cls in dataTable[pfamid]['difftopopair']:
                print "\t\t", cls
                for tup in dataTable[pfamid]['difftopopair'][cls]:
                    print "\t\t\t", tup  #}}}

    WriteHTML(dataTable, outpath)

    os.system("rm -rf %s" % (tmpdir))