def main(g_params):#{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    mapfile = ""

    i = 1
    isNonOptionArg=False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-map", "--map", "-mapfile"]:
                (mapfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1
    if myfunc.checkfile(infile) != 0:
        return 1
    if myfunc.checkfile(mapfile) != 0:
        return 1

    clanid2pfamidDict = myfunc.ReadFam2SeqidMap(mapfile)
    pfamPercentTMDict = ReadPercentTM(infile)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    GetPercentTMOfClan(pfamPercentTMDict, clanid2pfamidDict, fpout)
    myfunc.myclose(fpout)
Beispiel #2
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    outfile_with_famid = ""
    outfile_with_pdb = ""
    outfile_fam2seqmap = ""
    idListFile = ""
    mapfile = "%s%s%s" % (
        DATADIR3, os.sep,
        "wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.clanid2seqid"
    )
    restrictIDListFile = ""
    idList = []
    maxseq_for_fam = 200
    maxpair_for_fam = 300
    method = 0
    rand_seed = None
    pdbtospFile = ""
    isOnlyPDB = False

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile", "--outfile"]:
                outfile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outwithfamid", "--outwithfamid"]:
                outfile_with_famid, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outfam2seqmap", "--outfam2seqmap"]:
                outfile_fam2seqmap, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outwithpdb", "--outwithpdb"]:
                outfile_with_pdb, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in [
                    "-tmprolist", "--tmprolist", "-restrictlist",
                    "--restrictlist"
            ]:
                restrictIDListFile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mapfile", "--mapfile"]:
                mapfile, i = myfunc.my_getopt_str(argv, i)
            elif (argv[i] in ["-pdbtosp", "--pdbtosp"]):
                pdbtospFile, i = myfunc.my_getopt_str(argv, i)
            elif sys.argv[i] in ["-seed", "--seed"]:
                rand_seed, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-l", "--l"]:
                idListFile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxseq", "--maxseq"]:
                maxseq_for_fam, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-maxpair", "--maxpair"]:
                maxpair_for_fam, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-m", "--m", "-method", "--method"]:
                method, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-onlypdb", "--onlypdb"]:
                g_params['isOnlyPDB'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            idList.append(argv[i])
            i += 1

    if os.path.exists(idListFile):
        idList += myfunc.ReadIDList(idListFile)

    if len(idList) < 1:
        print >> sys.stderr, "no ID set. exit"
        return 1
    if myfunc.checkfile(mapfile, "idMapFile") != 0:
        return 1

    idMapDict = myfunc.ReadFam2SeqidMap(mapfile)

    # Read in pdbtosp map
    if pdbtospFile != "":
        (pdb2uniprotMap, uniprot2pdbMap) =\
                myfunc.ReadPDBTOSP(pdbtospFile)
        g_params['uniprotidlist_with_pdb'] = set(uniprot2pdbMap.keys())
        g_params['uniprot2pdbMap'] = uniprot2pdbMap

    if g_params['isOnlyPDB'] == True:
        if pdbtospFile == "":
            print >> sys.stderr, "onlypdb is true but pdbtospFile is not set. exit."
            return 1
        elif g_params['uniprotidlist_with_pdb'] == set([]):
            print >> sys.stderr, "onlypdb is true but uniprotidlist_with_pdb is empty. exit."
            return 1

    restrictIDSet = set([])
    if restrictIDListFile != "":
        restrictIDSet = set(myfunc.ReadIDList(restrictIDListFile))

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout_withfamid = myfunc.myopen(outfile_with_famid, None, "w", False)
    fpout_withpdb = myfunc.myopen(outfile_with_pdb, None, "w", False)
    fpout_fam2seqmap = myfunc.myopen(outfile_fam2seqmap, None, "w", False)

    if method == 0:
        GeneratePairWithinFam_m_0(idList, idMapDict, restrictIDSet,
                                  maxseq_for_fam, rand_seed, fpout,
                                  fpout_withfamid)
    elif method == 1:
        GeneratePairWithinFam_m_1(idList, idMapDict, restrictIDSet,
                                  maxpair_for_fam, rand_seed, fpout,
                                  fpout_withfamid, fpout_fam2seqmap)
    elif method == 2:  #all to all
        GeneratePairWithinFam_m_2(idList, idMapDict, restrictIDSet, fpout,
                                  fpout_withfamid, fpout_withpdb)

    myfunc.myclose(fpout)
    myfunc.myclose(fpout_withfamid)
    myfunc.myclose(fpout_withpdb)
    myfunc.myclose(fpout_fam2seqmap)
    return 0
Beispiel #3
0
        try:
            subprocess.check_output(["mkdir", "-p", outpath])
        except subprocess.CalledProcessError, e:
            print e
            return 1

    if myfunc.checkfile(pfamid2seqidFile, "pfamid2seqidFile") != 0:
        return 1

    if myfunc.checkfile("%s0.db" % topodb, "topodb") != 0:
        return 1

    if myfunc.checkfile("%s0.db" % seqdb, "seqdb") != 0:
        return 1

    pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidFile)

    hdl_topo = myfunc.MyDB(topodb)
    if not hdl_topo.failure:
        idSet_topo = set(hdl_topo.indexedIDList)
    else:
        idSet_topo = set([])
        print >> sys.stderr, "Failed to open topology database %s" % (topodb)
        return 1

    hdl_seq = myfunc.MyDB(seqdb)
    if hdl_seq.failure:
        print >> sys.stderr, "Failed to open sequence database %s" % (seqdb)
        return 1

    GetTMProList_per_family(pfamid2seqidDict, idSet_topo, hdl_seq, hdl_topo,
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1
    infile = ""
    outfile = ""
    seqid2pfamidFile = datadir3 + os.sep + "wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.seqid2pfamid"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid"]:
                seqid2pfamidFile = argv[i + 1]
                i += 2
            elif argv[i] in ["-o", "--o"]:
                outfile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print("Error! Wrong argument:", argv[i], file=sys.stderr)
                return 1
        else:
            infile = argv[i]
            i += 1
    if infile == "" or not os.path.exists(infile):
        print("Error. Infile not set. exit", file=sys.stderr)
        return 1
    if seqid2pfamidFile == "" or not os.path.exists(seqid2pfamidFile):
        print("Error. seqid2pfamidFile does not exist. exit", file=sys.stderr)
        return 1
    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidFile)
    if seqid2pfamidFile == {}:
        print("Read seqid2pfamidFile failed.", file=sys.stderr)
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(infile)
    if len(idList) < 1:
        print("Read infile failed.", file=sys.stderr)
        return 1

    idList.remove("Consensus")
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    WritePfamColorDef(idList, seqid2pfamidDict, fpout)

    myfunc.myclose(fpout)

    return 0
Beispiel #5
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    SPE_PAIR_LIST = [(2, 1), (2, 4), (2, 6), (2, 8), (3, 6), (3, 7), (4, 6),
                     (4, 8), (4, 10), (5, 7), (5, 10), (6, 8), (6, 10),
                     (6, 12), (7, 14), (8, 10), (8, 12), (10, 12), (10, 13),
                     (11, 13), (12, 14)]

    outfile = ""

    infile = ""
    pfamDefFile = "%s/data/pfam/pfam26.0/Pfam-A.clans.tsv" % (DATADIR3)
    signalpFile = "%s/wk/MPTopo/pfamAna_refpro/pred_signalp/refpro20120604-celluar.selmaxlength-m1.nr100.signalp_list" % (
        DATADIR3)

    #seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2clanid"%(DATADIR3)
    #seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2pfamid"%(DATADIR3)
    seqid2clanidMapFile = ""
    seqid2pfamidMapFile = ""
    tm_pfamidListFile = ""
    tm_clanidListFile = ""
    pfamid2seqidMapFile = ""
    clanid2seqidMapFile = ""
    dbname_predTM = ""
    pairlistwithpfamidFile = ""

    pfamtype = ""

    pairListFile = ""

    #classList_TableNumTMHeatMap = ["ALL", "RMSP"]
    classList_TableNumTMHeatMap = ["ALL"]

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (g_params['outpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-signalp", "--signalp"]:
                (signalpFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mp", "--mp"]:
                g_params[
                    'pairwise_comparison_method'], i = myfunc.my_getopt_int(
                        argv, i)
            elif argv[i] in ["-mindiffpair", "--mindiffpair"]:
                g_params['mindiffpair'], i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-pfamtype", "--pfamtype"]:
                pfamtype, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanidlist", "--clanidlist"]:
                (tm_clanidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamidlist", "--pfamidlist"]:
                (tm_pfamidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2clanid", "--seqid2clanid"]:
                (seqid2clanidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamid2seqid", "--pfamid2seqid"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanid2seqid", "--clanid2seqid"]:
                (clanid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlistwithpfamid", "--pairlistwithpfamid"]:
                (pairlistwithpfamidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-predTMdbname", "--predTMdbname"]:
                (dbname_predTM, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlist", "--pairlist"]:
                (pairListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-winsize", "--winsize"]:
                (g_params['winsize'], i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-outname", "--outname"]:
                (g_params['outname'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-prokar", "--prokar"]:
                g_params['isOnlyAnaProkar'] = True
                i += 1
            elif argv[i] in ["-eukar", "--eukar"]:
                g_params['isOnlyAnaEukar'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if myfunc.checkfile(
            infile, "%s (line %d): infile" %
        (__file__, inspect.currentframe().f_lineno)) != 0:
        return 1

    dirpath = myfunc.my_dirname(infile)

    # try to obtain Pfam family tag
    tag = ""
    if pfamtype != "":
        if pfamtype.upper().find("FAM") != -1:
            tag = ".Family"
        elif pfamtype.upper().find("DOM") != -1:
            tag = ".Domain"
        elif pfamtype.upper().find("REP") != -1:
            tag = ".Repeat"
        elif pfamtype.upper().find("MOT") != -1:
            tag = ".Motif"
        else:
            tag = ""
    else:
        if infile.find(".Family.") != -1:
            tag = ".Family"
        elif infile.find(".Domain.") != -1:
            tag = ".Domain"
        elif infile.find(".Repeat.") != -1:
            tag = ".Repeat"
        elif infile.find(".Motif.") != -1:
            tag = ".Motif"
        else:
            tag = ""

    if seqid2clanidMapFile == "":
        seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.seqid2clanid" % (
            DATADIR3)
    if myfunc.checkfile(
            seqid2clanidMapFile, "%s (line %d): seqid2clanidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if seqid2pfamidMapFile == "":
        seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.seqid2pfamid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            seqid2pfamidMapFile, "%s (line %d): seqid2pfamidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if pfamid2seqidMapFile == "":
        pfamid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.pfamid2seqid" % (
            DATADIR3)
    if myfunc.checkfile(
            pfamid2seqidMapFile, "%s (line %d): pfamid2seqidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if clanid2seqidMapFile == "":
        clanid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.clanid2seqid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            clanid2seqidMapFile, "%s (line %d): clanid2seqidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_pfamidListFile == "":
        tm_pfamidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.pfamidlist" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            tm_pfamidListFile, "%s (line %d): tm_pfamidListFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_clanidListFile == "":
        tm_clanidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.clanidlist" % (
            DATADIR3)
    if myfunc.checkfile(
            tm_clanidListFile, "%s (line %d): tm_clanidListFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if dbname_predTM == "":
        dbname_predTM = "%s/wk/MPTopo/pfamAna_refpro/pred_topcons_single_method4/refpro20120604-celluar.selmaxlength-m1.topcons-single_topcons_single.m1.agree-44.RMSP" % (
            DATADIR3)
    if myfunc.checkfile(
            "%s0.db" % (dbname_predTM), "%s (line %d): dbname_predTM" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if g_params['isOnlyAnaProkar']:
        prokarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Prokaryota.seqidlist" % (
            DATADIR3)
        g_params['prokarSeqIDSet'] = set(myfunc.ReadIDList(prokarseqidfile))
        if len(g_params['prokarSeqIDSet']) < 1:
            return 1
    if g_params['isOnlyAnaEukar']:
        eukarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Eukaryota.seqidlist" % (
            DATADIR3)
        g_params['eukarSeqIDSet'] = set(myfunc.ReadIDList(eukarseqidfile))
        if len(g_params['eukarSeqIDSet']) < 1:
            return 1

    if pairlistwithpfamidFile == "":
        pairlistwithpfamidFile = "%s/../../Pfam-.maxpair100.pairlistwithpfamid" % (
            dirpath)
    if myfunc.checkfile(
            pairlistwithpfamidFile, "%s (line %d): pairlistwithpfamidFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    pfamid_2_seqidpair_Dict = ReadPairListWithFamID(pairlistwithpfamidFile)
    usedPfamIDSet = set(
        pfamid_2_seqidpair_Dict.keys())  # pfamids used in pair selection

    if pairListFile != "":
        li = myfunc.ReadPairList(pairListFile)
        SPE_PAIR_LIST = []
        for tup in li:
            SPE_PAIR_LIST.append((int(tup[0]), int(tup[1])))

    (pfamidDefDict, clanidDefDict) = ReadPfamDefFile(pfamDefFile)
    signalpDict = lcmp.ReadSignalPDict(signalpFile)

    seqid2clanidDict = myfunc.ReadFam2SeqidMap(seqid2clanidMapFile)
    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)

    clanid2seqidDict = myfunc.ReadFam2SeqidMap(clanid2seqidMapFile)
    pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tm_pfamidList = myfunc.ReadIDList(tm_pfamidListFile)
    tm_clanidList = myfunc.ReadIDList(tm_clanidListFile)

    tm_pfamidSet = set(tm_pfamidList)
    tm_clanidSet = set(tm_clanidList)

    hdl_predTM = myfunc.MyDB(dbname_predTM)
    if not hdl_predTM.failure:
        idSet_TMpro = set(hdl_predTM.indexedIDList)
    else:
        idSet_TMpro = set([])

    #classList_TableNumTMHeatMap = ["ALL", "RMSP", "RMDUP"]
    #alignrangeList = ['FULL_ALIGNED', 'all', 'PART_ALIGNED']
    alignrangeList = ['FULL_ALIGNED']

    if g_params['outpath'] != "" and not os.path.exists(g_params['outpath']):
        cmd = ["mkdir", "-p", g_params['outpath']]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError, e:
            print e
            return 1
Beispiel #6
0
def main(g_params):#{{{
    argv = sys.argv
    numArgv=len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1
    i = 1
    isNonOptionArg=False
    isPickOne = False
    infile = ""
    pfamDefFile = '/data3/data/pfam/pfam26.0/Pfam-A.clans.tsv'
    seqDefFile = '/data3/wk/MPTopo/pfamAna/pfam2-selTM-giid-refseqid-pfamid-description.txt'
    idwithannoFile = "/data3/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.uniq.idwithanno"
    seqLengthFile = "/data3/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.uniq.seqlen"
    topoDB = "/data3/wk/MPTopo/pfamAna_refpro/pred_topcons/refpro20120604-celluar.selmaxlength-m1.topcons.result_TOPCONS.topo"
    seqDB = "/data3/wk/MPTopo/pfamAna_refpro/cellular_filter_fragment/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.uniq"
    pfamscanFile = "/data3/wk/MPTopo/pfamAna_refpro/result_pfamscan/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.pfamscan"

    outpath = ""
    outfile = ""
    htmlname = 'index'
    while i < numArgv:#{{{
        if isNonOptionArg == True:
            infile = argv[i]
            isNonOptionArg=False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg=True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] ==  "-h" or  argv[i] == "--help":
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                outpath = argv[i+1]
                i += 2
            elif argv[i] in ["-o", "--o"]:
                (outfile, i) = myfunc.my_getopt_str(argv,i)
            elif argv[i] in ["-htmlname", "--htmlname"]:
                htmlname = argv[i+1]
                i += 2
            elif argv[i] in ["-seqlen", "--seqlen"]:
                seqLengthFile = argv[i+1]
                i += 2
            elif argv[i] in ["-topodb", "--topodb"]:
                topoDB = argv[i+1]
                i += 2
            elif argv[i] in ["-seqdb", "--seqdb"]:
                seqDB = argv[i+1]
                i += 2
            elif argv[i] in [ "-pfamdef", "--pfamdef"]:
                pfamDefFile = argv[i+1]
                i += 2
            elif argv[i] in ["-pfamscan", "--pfamscan"]:
                pfamscanFile = argv[i+1]
                i += 2
            elif argv[i] in ["-seqdef", "--seqdef"]:
                seqDefFile = argv[i+1]
                i += 2
            elif argv[i] in ["-q"]:
                isQuiet = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1
#}}}
    g_params['outpath'] = outpath 
    if outpath == "":
        print >> sys.stderr, "outpath not set"
        return 1
    elif outpath != "" and not os.path.exists(outpath):
        os.makedirs(outpath)

    if infile == "":
        print >> sys.stderr, "infile not set"
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(infile)
# group sequences by domains
    groupDict = {}
    for seqid in seqid2pfamidDict:
        famlist = seqid2pfamidDict[seqid]
        ss = "\t".join(famlist)
        if not ss in groupDict:
            groupDict[ss] = []
        groupDict[ss].append(seqid)
    groupList = []
    for ss in groupDict:
        groupList.append((ss, len(groupDict[ss]), groupDict[ss]))
    groupList = sorted(groupList, key=lambda x:x[1], reverse=True)

    seqlenDict = ReadSeqLengthDict(seqLengthFile)
    seqannoDict = ReadIDWithAnnoInfo(idwithannoFile)
    (pfamidDefDict, clanidDefDict) = lcmp.ReadPfamDefFile(pfamDefFile)

    topoDict = GetTopoDict(topoDB, seqid2pfamidDict.keys())
    seqDict = GetTopoDict(seqDB, seqid2pfamidDict.keys())

    pfamScanDict = myfunc.ReadPfamScan(pfamscanFile)
    groupedPfamScanDict = GroupPfamScanDict(pfamScanDict)

#     WriteHTML(seqid2pfamidDict, seqlenDict, seqannoDict, pfamidDefDict,
#             clanidDefDict, topoDict, htmlname, outpath)
    WriteInfo(groupList, seqlenDict, seqannoDict, pfamidDefDict,
            clanidDefDict, topoDict, groupedPfamScanDict, htmlname, fpout)
    MakeAlignment(groupList, seqDict, topoDict, outpath)
    myfunc.myclose(fpout)
    return 0
Beispiel #7
0
def main(g_params):  #{{{
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = ""
    pairListFile = ""
    seqlenFile = ""
    shortid2fullidFile = ""
    seqid2pfamidMapFile = ""
    pfamDefFile = '/data3/data/pfam/pfam27.0/Pfam-A.clans.tsv'
    topodb = ""
    seqdb = ""
    pdb2spFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            isNonOptionArg = False
            i += 1
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topodb", "--topodb"]:
                (topodb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pdb2sp", "-pdb2sp", "-pdbtosp", "--pdbtosp"]:
                (pdb2spFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqmsapath", "--seqmsapath"]:
                (g_params['seqmsapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-datapath", "--datapath"]:
                (g_params['datapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seq2pfam", "--seq2pfam"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfam2seq", "--pfam2seq"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-description", "--description"]:
                (g_params['description'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if not g_params['alignrange'] in ['all', 'full', 'part']:
                    print >> sys.stderr, "alignrange must be one of [all, full, part]"
                    return 1
                else:
                    if g_params['alignrange'] == 'full':
                        g_params['alignrange'] = 'FULL_ALIGNED'
                    elif g_params['alignrange'] == 'part':
                        g_params['alignrange'] = 'PART_ALIGNED'
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-treepath", "--treepath"]:
                (g_params['treepath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairalnpath", "--pairalnpath"]:
                (g_params['pairalnpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxperfamily", "--maxperfamily"]:
                (g_params['max_num_output_per_family'],
                 i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-shortid2fullid", "--shortid2fullid"]:
                (shortid2fullidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-debug", "--debug"]:
                if argv[i + 1][0].lower() == 'y':
                    g_params['isDEBUG'] = True
                else:
                    g_params['isDEBUG'] = False
                i += 2
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1

    if g_params['basename'] == "":
        print >> sys.stderr, "basename not set. exit"
        return 1
    if myfunc.checkfile(g_params['datapath'], "datapath") != 0:
        return 1
    if myfunc.checkfile(seqid2pfamidMapFile, "seqid2pfamidMapFile") != 0:
        return 1
    if myfunc.checkfile(pfamid2seqidMapFile, "pfamid2seqidMapFile") != 0:
        return 1

    if myfunc.checkfile(topodb + "0.db", "topodb") != 0:
        return 1
    if myfunc.checkfile(seqdb + "0.db", "seqdb") != 0:
        return 1
    if myfunc.checkfile(g_params['seqmsapath'], "seqmsapath") != 0:
        return 1

    if pdb2spFile != "":
        (g_params['pdb2uniprotMap'],
         g_params['uniprot2pdbMap']) = myfunc.ReadPDBTOSP(pdb2spFile)

    if g_params['datapath'] == "":
        print >> sys.stderr, "datapath not set"
        return 1
    elif not os.path.exists(g_params['datapath']):
        print >> sys.stderr, "datapath %s does not exist" % (
            g_params['datapath'])
        return 1

    if outpath == "":
        print >> sys.stderr, "outpath not set"
        return 1
    elif not os.path.exists(outpath):
        cmd = ["mkdir", "-p", outpath]
        subprocess.check_call(cmd)

    paircmpfile = "%s/%s.paircmp" % (g_params['datapath'],
                                     g_params['basename'])
    if myfunc.checkfile(paircmpfile, "paircmpfile") != 0:
        return 1

    (g_params['pfamidDefDict'],
     g_params['clanidDefDict']) = lcmp.ReadPfamDefFile(pfamDefFile)

    g_params['seqid2pfamidDict'] = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)
    g_params['pfamid2seqidDict'] = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tmpdir = tempfile.mkdtemp()
    if g_params['msapath'] == "":
        g_params['msapath'] = tmpdir
    if g_params['treepath'] == "":
        g_params['treepath'] = tmpdir
    if g_params['pairalnpath'] == "":
        g_params['pairalnpath'] = tmpdir

    pairCmpRecordList = []
    unprocessedBuffer = ""
    cntTotalReadInRecord = 0
    cntTotalOutputRecord = 0
    isEOFreached = False
    try:
        fpin = open(paircmpfile, "r")
    except IOError:
        print >> sys.stderr, "Failed to open input file %s" % (paircmpfile)
        return 1
    while 1:
        buff = fpin.read(myfunc.BLOCK_SIZE)
        if buff == "":
            isEOFreached = True
        buff = unprocessedBuffer + buff
        rdList = []
        unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(buff, rdList)
        rdList = FilterPairCmpResult(rdList)
        cntTotalReadInRecord += len(rdList)
        pairCmpRecordList += rdList
        if isEOFreached == True:
            break
    fpin.close()

    print "cntTotalReadInRecord =", cntTotalReadInRecord

    g_params['hdl_seqdb'] = myfunc.MyDB(seqdb)
    g_params['hdl_topodb'] = myfunc.MyDB(topodb)

    g_params['OS'] = os.uname()[0]
    if g_params['OS'].find('Linux') != -1:
        g_params['CP_EXE'] = "/bin/cp -uf"
    else:
        g_params['CP_EXE'] = "/bin/cp -f"

    if shortid2fullidFile != "":
        g_params['uniprotAC2FullSeqIDMap'] = myfunc.ReadID2IDMap(
            shortid2fullidFile)

    addname = ""
    if g_params['alignrange'] != 'all':
        addname += ".%s" % (g_params['alignrange'])

    dataTable = {}
    # structure of dataTable
    #    dataTable[pfamid] = {'set_seqid':set(), 'difftopopair':[{'INV':[(id1,id2)]},{'TM2GAP':},{}}

    # first read in pairCmpRecordList
    AddAllSeqInPairCmp(dataTable, pairCmpRecordList,
                       g_params['seqid2pfamidDict'])

    pairInfoFileList = []
    for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]:
        ss = "%s/%s_.cmpdup.FULL_ALIGNED.%s.pairinfo.txt" % (
            g_params['datapath'], g_params['basename'], cmpclass)
        pairInfoFileList.append(ss)
        pairinfoList = ReadPairInfo_cmpclass(ss)
        AddPairInfo(dataTable, pairinfoList, cmpclass)
#     print "\n".join(pairInfoFileList)
    if g_params['isDEBUG']:  #{{{
        for pfamid in dataTable:
            print pfamid
            print "\tset_seqid"
            print dataTable[pfamid]['set_seqid']
            print "\tdifftopopair"
            for cls in dataTable[pfamid]['difftopopair']:
                print "\t\t", cls
                for tup in dataTable[pfamid]['difftopopair'][cls]:
                    print "\t\t\t", tup  #}}}

    WriteHTML(dataTable, outpath)

    os.system("rm -rf %s" % (tmpdir))