Ejemplo n.º 1
0
def main():
    if len(sys.argv)!=6:
        print ("python blmniisvm_experiment.py [e|ic|gpcr|nr] [clustMethod] "
                "[dataPath] [clusterPath] [outPath]")
        return

    dataset = sys.argv[1]
    method = sys.argv[2]
    dataPath = sys.argv[3]
    clusterPath = sys.argv[4]
    outPath = sys.argv[5]

    print "Loading Adjacency"
    connMat,comList,proList = yam.loadComProConnMat(dataset,dataPath+"/Adjacency")
    nComp = len(comList)
    nProtein = len(proList)

    print "Loading Cluster"
    comClust = loadCluster(clusterPath+"/cluster_"+method+"_com_"+dataset+".json",comList)
    proClust = loadCluster(clusterPath+"/cluster_"+method+"_pro_"+dataset+".json",proList)

    print "Generate Negative Data"
    connMat = genNegativeData(connMat,proClust,comClust)

    print "Writing Output To "+outPath
    connMat = [[row[i] for row in connMat] for i in range(len(connMat[0]))]
    with open(outPath+"/admat_dgc_"+dataset+"_negative.txt",'w') as f:
        for i,c in enumerate(comList):
            if i>0:
                f.write(" ")
            f.write(str(c))
        f.write("\n")
        for i,r in enumerate(connMat):
            f.write(proList[i].ljust(7))
            for j,c in enumerate(r):
                f.write(" ")
                f.write(str(c))
            f.write("\n")

    print "Stats: "
    unlabeled = 0
    negative = 0
    positive = 0
    total = nComp*nProtein

    for i in connMat:
        for j in i:
            if j == 0:
                unlabeled += 1
            elif j == -1:
                negative += 1
            elif j == 1:
                positive += 1

    print "Total Data: "+str(total)
    print "Positive Data: "+str(positive)
    print "Unlabeled Data: "+str(unlabeled)
    print "Negative Data: "+str(negative)
Ejemplo n.º 2
0
def main(argv):
    if len(argv) != 3:
        print 'USAGE: python devel.py [dataMode] [valMode]'
        return

    dataMode = argv[1]
    valMode = argv[2]

    # load development dataset, containing com-pro connectivity
    connMat, comList, proList = yam.loadComProConnMat(dataMode)
    kernel = yam.loadKernel(dataMode)

    ##
    dataX = []
    dataY = []
    for i, ii in enumerate(comList):
        for j, jj in enumerate(proList):
            dataX.append((ii, jj))
            dataY.append(connMat[i][j])
    nData = len(dataY)

    ##
    nFolds = None
    kfList = None
    if valMode == 'loocv':
        nFolds = nData
        kfList = KFold(nData, n_folds=nFolds, shuffle=True)
    elif valMode == 'kfcv':
        nFolds = 10
        kfList = StratifiedKFold(dataY, n_folds=nFolds, shuffle=True)
    else:
        assert (False)

    kronrls = KronRLS(connMat, comList, proList, kernel)

    ## prep for parallel
    xTestList = []
    yTestList = []
    for trIdxList, testIdxList in kfList:
        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]

        xTestList.append(xTest)
        yTestList.append(yTest)

    ##
    yPredList = fu.map(evalPerFold, xTestList, yTestList, [kronrls] * nFolds,
                       [connMat] * nFolds, [comList] * nFolds,
                       [proList] * nFolds, [kernel] * nFolds)
Ejemplo n.º 3
0
def main(argv):
    if len(argv)!=3:
        print 'USAGE: python devel.py [dataMode] [valMode]'
        return

    dataMode = argv[1]
    valMode = argv[2]

    # load development dataset, containing com-pro connectivity
    connMat,comList,proList = yam.loadComProConnMat(dataMode)
    kernel = yam.loadKernel(dataMode)

    ##
    dataX = []
    dataY = []
    for i,ii in enumerate(comList):
        for j,jj in enumerate(proList):
            dataX.append( (ii,jj) )
            dataY.append( connMat[i][j] )
    nData = len(dataY)

    ##
    nFolds = None
    kfList = None
    if valMode=='loocv':
        nFolds = nData
        kfList = KFold(nData, n_folds=nFolds, shuffle=True)
    elif valMode=='kfcv':
        nFolds = 10
        kfList = StratifiedKFold(dataY, n_folds=nFolds, shuffle=True)
    else:
        assert(False)

    kronrls = KronRLS(connMat,comList,proList,kernel)

    ## prep for parallel
    xTestList = []
    yTestList = []
    for trIdxList, testIdxList in kfList:
        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]

        xTestList.append(xTest)
        yTestList.append(yTest)

    ##
    yPredList = fu.map(evalPerFold,xTestList,yTestList,[kronrls]*nFolds,
                       [connMat]*nFolds,[comList]*nFolds,[proList]*nFolds,[kernel]*nFolds)
Ejemplo n.º 4
0
def main(argv):
    if len(argv) != 3:
        print 'USAGE: '
        print 'python devel.py [dataMode:e/nr/gpcr/ic] [valMode:loocv/kfcv]'
        return

    dataMode = argv[1]
    valMode = argv[2]

    # load development dataset, containing com-pro connectivity
    connMatDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/ground-truth'
    connMat, comList, proList = yam.loadComProConnMat(dataMode, connMatDpath)

    kernelDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/similarity-mat'
    kernel = yam.loadKernel(dataMode, kernelDpath)

    ##
    dataX = []
    dataY = []
    for i, ii in enumerate(comList):
        for j, jj in enumerate(proList):
            dataX.append((ii, jj))
            dataY.append(connMat[i][j])
    nData = len(dataY)
    print 'nData= ' + str(nData)

    ## instantiate a KronRLS predictor
    kronrls = KronRLS(cfg, connMat, comList, proList, kernel)

    ##
    nFolds = None
    kfList = None
    if valMode == 'loocv':
        nFolds = nData
        kf = KFold(n_splits=nFolds)
        kfList = kf.split(dataX)
    elif valMode == 'kfcv':
        nFolds = 10
        skf = StratifiedKFold(n_splits=nFolds)
        kfList = skf.split(dataX, dataY)
    else:
        assert False, 'Unknown valMode'

    yTestList = []
    yPredList = []
    fold = 0
    for trIdxList, testIdxList in kfList:
        fold += 1
        print 'fold=', fold, 'of', nFolds, '######################################'

        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]
        # xTr = [dataX[i] for i in trIdxList]
        # yTr = [dataY[i] for i in trIdxList]

        # test
        yPred = kronrls.predict(xTest)

        yTestList += yTest
        yPredList += yPred

    ##
    print 'calculating aupr...'
    precision, recall, _ = precision_recall_curve(yTestList, yPredList)
    aupr = average_precision_score(yTestList, yPredList, average='micro')

    ##
    print 'plotting ...'
    plt.clf()
    plt.figure()

    plt.plot(recall, precision, 'r-', label='(area = %0.2f)' % aupr, lw=2)

    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve of ' + dataMode + ' ' + valMode)
    plt.legend(loc="lower left")

    fname = 'pr_curve_' + dataMode + '_' + valMode + '.png'
    plt.savefig(outDir + fname, bbox_inches='tight')
Ejemplo n.º 5
0
def auprTest():
    """Run a BLM-NII AUPR experiment driven by sys.argv.

    Usage: devel.py aupr [numCore] [DataSetCode] [evalMode] [dataPath] [outPath].
    Builds symmetrized compound/protein similarity kernels, splits indices
    (LOOCV or 10-fold), predicts single- or multi-core, then writes a perf
    JSON and saves a precision-recall curve under outPath.
    """
    # shared with the worker helpers (singleProc/parallelProc) via module globals
    global classParam,comSimMat,proSimMat,connMat,comNetSim,proNetSim
    # configure BLM-NII to return raw scores over precomputed kernels
    classParam = blmniiConfig
    classParam["proba"] = False
    classParam["data"] = "precomputed"
    classParam["kernel"] = ["precomputed","rbf"]
    classParam["retValue"] = "score"
    if len(sys.argv)!=7:
        print "Usage: python devel.py aupr [numCore] [DataSetCode] [evalMode] [dataPath] [outPath]"
        return

    core = int(sys.argv[2])
    dataset = sys.argv[3]
    evalMode = sys.argv[4]
    dataPath = sys.argv[5]
    outPath = sys.argv[6]
    print "Building Data"
    connMat,comList,proList = yam.loadComProConnMat(dataset,dataPath+"/Adjacency")
    kernel = yam.loadKernel(dataset,dataPath)

    # work on integer indices rather than raw identifiers
    proListIdx = [i for i,_ in enumerate(proList)]
    comListIdx = [i for i,_ in enumerate(comList)]

    nComp = len(comList)
    nProtein = len(proList)

    # symmetrize the kernel by averaging k(i,j) and k(j,i)
    comSimMat = np.zeros((nComp,nComp), dtype=float)
    proSimMat = np.zeros((nProtein,nProtein), dtype=float)
    for row,i in enumerate(comList):
        for col,j in enumerate(comList):
            comSimMat[row][col] = (kernel[(i,j)]+kernel[(j,i)])/2

    for row,i in enumerate(proList):
        for col,j in enumerate(proList):
            proSimMat[row][col] = (kernel[(i,j)]+kernel[(j,i)])/2


    # wrap as {"precomputed": ...} dicts after kernel regularization
    # (an "rbf" entry is added later in the multi-core branch)
    comSimMat = {"precomputed":regularizationKernel(comSimMat)}
    proSimMat = {"precomputed":regularizationKernel(proSimMat)}

    pairData = []
    connList = []
    print "Split Dataset..."
    if evalMode == "loocv":
        # leave-one-out: one fold per compound and one per protein
        nFold = len(comListIdx)
        kSplit = KFold(n_splits=nFold,shuffle=True)
        comSplit = kSplit.split(comListIdx)

        nFold = len(proListIdx)
        kSplit = KFold(n_splits=nFold,shuffle=True)
        proSplit = kSplit.split(proListIdx)

    elif evalMode == "kfold":
        nFold = 10
        kSplit = KFold(n_splits=nFold, shuffle=True)
        comSplit = kSplit.split(comListIdx)
        proSplit = kSplit.split(proListIdx)

    else:
        assert(False)

    predictedData = np.zeros((len(comList),len(proList)),dtype=float)
    splitPred = []
    proTestList = []
    proTrainList = []
    comTestList = []
    comTrainList = []

    # materialize fold index lists from the split generators
    for trainIndex, testIndex in proSplit:
        proTestList.append([i for i in testIndex])
        proTrainList.append([i for i in trainIndex])
    for trainIndex, testIndex in comSplit:
        comTestList.append([i for i in testIndex])
        comTrainList.append([i for i in trainIndex])

    if core == 1:
        predRes,testData = singleProc(comTrainList,proTrainList,comTestList,proTestList)
    elif core > 1:
        # precompute the network-similarity "rbf" kernels once before
        # handing the folds to the parallel workers
        tempPred = BLMNII(classParam)
        tempPred.setAttr(connMat=connMat,comSimMat=comSimMat,proSimMat=proSimMat)

        comSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("com"))
        proSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("pro"))
        tempPred.setAttr(comSimMat=comSimMat,proSimMat=proSimMat)

        predRes,testData = parallelProc(core,comTrainList,proTrainList,comTestList,proTestList)
    else:
        print "Error: Invalid core processor number"
        return
#####################################################################
    print "\nCalculate Performance"
    key = 'BLM-NII'
    precision, recall, _ = precision_recall_curve(testData, predRes)
    prAUC = average_precision_score(testData, predRes, average='micro')

    print "Visualiation"
    lineType = 'k-.'

    perf = {'precision': precision, 'recall': recall, 'prAUC': prAUC,
                 'lineType': lineType}
    # JSON-friendly summary (arrays omitted)
    perf2 = {'prAUC': prAUC, 'nTest': nComp*nProtein}

    with open(outPath+'perf_blmnii_'+dataset +'_'+evalMode+'_'+str(classParam["alpha"])+'_'+str(classParam["gamma"])+'_perf.json', 'w') as fp:
        json.dump(perf2, fp, indent=2, sort_keys=True)

    plt.clf()
    plt.figure()
    plt.plot(perf['recall'], perf['precision'], perf['lineType'], label= key+' (area = %0.2f)' % perf['prAUC'], lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.savefig(outPath+'/'+ dataset +'_'+evalMode+'_'+str(classParam["alpha"])+'_'+str(classParam["gamma"])+'_pr_curve_.png', bbox_inches='tight')
Ejemplo n.º 6
0
def main():
    """Derive negative interaction labels from clustering results.

    For each (compound-metric, protein-metric) pair, counts positive
    interactions between every compound-cluster/protein-cluster pair;
    entries whose cluster pair has zero positives are relabeled -1 in a
    copy of connMat, and the resulting label dicts are written to pickle
    and JSON files under targetDir.
    """
    if len(sys.argv) != 2:
        print 'USAGE:'
        print 'python cluster2.py [targetDir]'
        print '/param targetDir: dir containing one compound and one protein clustering results'
        return

    tdir = sys.argv[1]
    metrics = ['calinskiharabaz', 'silhouette']
    modes = ['compound', 'protein']

    ##
    print 'loading connMat...'
    # the target dir name encodes the dataset as ...-<dataset>#<subset>
    dataset = tdir.split('/')[-1].split('-')[-1]
    dParam = dataset.split('#')
    dpath = os.path.join(DATASET_DIR, dParam[0], 'ground-truth')
    connMat, comList, proList = yam.loadComProConnMat(dParam[1], dpath)

    ##
    print 'loading compound and protein clusters...'

    def _loadCluster(mode, metric):
        # pick the single result dir whose name contains this mode
        dname = [i for i in os.listdir(tdir) if (mode in i)][0]
        dpath = os.path.join(tdir, dname)
        assert os.path.isdir(dpath)
        item2clusterlabel = dict()
        clusterlabel2item = defaultdict(list)
        fname = '_'.join([mode, metric, 'bestlabels.json'])
        with open(os.path.join(dpath, fname), 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input; prefer yaml.safe_load if these files are not
            # fully trusted.
            item2clusterlabel = yaml.load(f)
        # invert the mapping: cluster label -> list of member items
        for k, v in item2clusterlabel.iteritems():
            clusterlabel2item[v].append(k)
        return {
            'item2clusterlabel': item2clusterlabel,
            'clusterlabel2item': clusterlabel2item
        }

    clusterData = {}
    for mode in modes:
        for metric in metrics:
            clusterData[(mode, metric)] = _loadCluster(mode, metric)

    ##
    def _getNumberOfConnBetweenComProClusters(comMetric, proMetric):
        # count positive interactions between every (com cluster, pro cluster)
        # pair; the -1 label is skipped (treated as outlier cluster below)
        nConnDict = {}
        for comCluster in clusterData[('compound',
                                       comMetric)]['clusterlabel2item']:
            if (comCluster == -1): continue
            for proCluster in clusterData[('protein',
                                           proMetric)]['clusterlabel2item']:
                if (proCluster == -1): continue
                nConn = 0
                for com in clusterData[(
                        'compound',
                        comMetric)]['clusterlabel2item'][comCluster]:
                    for pro in clusterData[(
                            'protein',
                            proMetric)]['clusterlabel2item'][proCluster]:
                        conn = int(
                            connMat[comList.index(com)][proList.index(pro)])
                        if conn == 1: nConn += 1
                nConnDict[(comCluster, proCluster)] = nConn
        return nConnDict

    connAmongComProClusters = dict()
    for comMet in metrics:
        for proMet in metrics:
            print 'get clusterConn of ' + comMet + ' and ' + proMet
            connAmongComProClusters[(
                comMet, proMet)] = _getNumberOfConnBetweenComProClusters(
                    comMet, proMet)

    ##
    for comMet in metrics:
        for proMet in metrics:
            print 'getting connMat2 of ' + comMet + ' and ' + proMet
            # relabel unknown entries as negative (-1) when the owning
            # cluster pair has no positive interaction at all
            connMat2 = np.copy(connMat)
            for i in range(connMat.shape[0]):
                for j in range(connMat.shape[1]):
                    if connMat[i][j] == 1:
                        continue  # because of known positive interaction
                    comCluster = clusterData[(
                        'compound', comMet)]['item2clusterlabel'][comList[i]]
                    proCluster = clusterData[(
                        'protein', proMet)]['item2clusterlabel'][proList[j]]
                    if (comCluster == -1) or (proCluster == -1):
                        continue  # because of outlier cluster label
                    if connAmongComProClusters[(comMet,
                                                proMet)][(comCluster,
                                                          proCluster)] == 0:
                        connMat2[i][j] = -1

            # connDict: label -> list of (com, pro) pairs
            # connDict2: label -> [count, fraction of all pairs]
            connDict = defaultdict(list)
            connDict2 = defaultdict(list)
            connDictRaw = util.connMat2Dict(connMat2, comList, proList)
            for k, v in connDictRaw.iteritems():
                connDict[int(v)].append(k)
            for k, v in connDict.iteritems():
                connDict2[k].append(len(v))
            summ = sum([v[0] for v in connDict2.values()])
            for k, v in connDict2.iteritems():
                connDict2[k].append(float(v[0]) / summ)

            ##
            tag = '_'.join([comMet, proMet])
            # np.savetxt(os.path.join(tdir,tag+'_connMat.csv'),connMat2,delimiter=',')
            # with open(os.path.join(tdir,tag+"labels.json"),'w') as f:
            #     json.dump(connDict,f,indent=2,sort_keys=True)
            with open(os.path.join(tdir, tag + "_labels.pkl"), 'w') as f:
                pickle.dump(connDict, f)
            with open(os.path.join(tdir, tag + "_labels_stat.json"), 'w') as f:
                json.dump(connDict2, f, indent=2, sort_keys=True)
Ejemplo n.º 7
0
def main(argv):
    if len(argv)!=3:
        print 'USAGE: '
        print 'python devel.py [dataMode:e/nr/gpcr/ic] [valMode:loocv/kfcv]'
        return

    dataMode = argv[1]
    valMode = argv[2]

    # load development dataset, containing com-pro connectivity
    connMatDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/ground-truth'
    connMat,comList,proList = yam.loadComProConnMat(dataMode,connMatDpath)

    kernelDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/similarity-mat'
    kernel = yam.loadKernel(dataMode,kernelDpath)

    ##
    dataX = []
    dataY = []
    for i,ii in enumerate(comList):
        for j,jj in enumerate(proList):
            dataX.append( (ii,jj) )
            dataY.append( connMat[i][j] )
    nData = len(dataY)
    print 'nData= '+str(nData)

    ## instantiate a KronRLS predictor
    kronrls = KronRLS(cfg,connMat,comList,proList,kernel)

    ##
    nFolds = None
    kfList = None
    if valMode=='loocv':
        nFolds = nData
        kf = KFold(n_splits=nFolds)
        kfList = kf.split(dataX)
    elif valMode=='kfcv':
        nFolds = 10
        skf = StratifiedKFold(n_splits=nFolds)
        kfList = skf.split(dataX,dataY)
    else:
        assert False,'Unknown valMode'

    yTestList = []
    yPredList = []
    fold = 0
    for trIdxList, testIdxList in kfList:
        fold += 1
        print 'fold=',fold,'of',nFolds,'######################################'

        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]
        # xTr = [dataX[i] for i in trIdxList]
        # yTr = [dataY[i] for i in trIdxList]

        # test
        yPred = kronrls.predict(xTest)

        yTestList += yTest
        yPredList += yPred

    ##
    print 'calculating aupr...'
    precision, recall, _ = precision_recall_curve(yTestList, yPredList)
    aupr = average_precision_score(yTestList, yPredList, average='micro')

    ##
    print 'plotting ...'
    plt.clf()
    plt.figure()

    plt.plot(recall, precision, 'r-',
             label= '(area = %0.2f)' % aupr, lw=2)

    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve of '+dataMode+' '+valMode)
    plt.legend(loc="lower left")

    fname = 'pr_curve_'+dataMode+'_'+valMode+'.png'
    plt.savefig(outDir+fname, bbox_inches='tight')
Ejemplo n.º 8
0
def main():
    if len(sys.argv) != 6:
        print(
            "python blmniisvm_experiment.py [e|ic|gpcr|nr] [clustMethod] "
            "[dataPath] [clusterPath] [outPath]")
        return

    dataset = sys.argv[1]
    method = sys.argv[2]
    dataPath = sys.argv[3]
    clusterPath = sys.argv[4]
    outPath = sys.argv[5]

    print "Loading Adjacency"
    connMat, comList, proList = yam.loadComProConnMat(dataset,
                                                      dataPath + "/Adjacency")
    nComp = len(comList)
    nProtein = len(proList)

    print "Loading Cluster"
    comClust = loadCluster(
        clusterPath + "/cluster_" + method + "_com_" + dataset + ".json",
        comList)
    proClust = loadCluster(
        clusterPath + "/cluster_" + method + "_pro_" + dataset + ".json",
        proList)

    print "Generate Negative Data"
    connMat = genNegativeData(connMat, proClust, comClust)

    print "Writing Output To " + outPath
    connMat = [[row[i] for row in connMat] for i in range(len(connMat[0]))]
    with open(outPath + "/admat_dgc_" + dataset + "_negative.txt", 'w') as f:
        for i, c in enumerate(comList):
            if i > 0:
                f.write(" ")
            f.write(str(c))
        f.write("\n")
        for i, r in enumerate(connMat):
            f.write(proList[i].ljust(7))
            for j, c in enumerate(r):
                f.write(" ")
                f.write(str(c))
            f.write("\n")

    print "Stats: "
    unlabeled = 0
    negative = 0
    positive = 0
    total = nComp * nProtein

    for i in connMat:
        for j in i:
            if j == 0:
                unlabeled += 1
            elif j == -1:
                negative += 1
            elif j == 1:
                positive += 1

    print "Total Data: " + str(total)
    print "Positive Data: " + str(positive)
    print "Unlabeled Data: " + str(unlabeled)
    print "Negative Data: " + str(negative)
Ejemplo n.º 9
0
def auprTest():
    """Run a BLM-NII AUPR experiment driven by sys.argv.

    Usage: devel.py aupr [numCore] [DataSetCode] [evalMode] [dataPath] [outPath].
    Builds symmetrized compound/protein similarity kernels, splits indices
    (LOOCV or 10-fold), predicts single- or multi-core, then writes a perf
    JSON and saves a precision-recall curve under outPath.
    """
    # shared with the worker helpers (singleProc/parallelProc) via module globals
    global classParam, comSimMat, proSimMat, connMat, comNetSim, proNetSim
    # configure BLM-NII to return raw scores over precomputed kernels
    classParam = blmniiConfig
    classParam["proba"] = False
    classParam["data"] = "precomputed"
    classParam["kernel"] = ["precomputed", "rbf"]
    classParam["retValue"] = "score"
    if len(sys.argv) != 7:
        print "Usage: python devel.py aupr [numCore] [DataSetCode] [evalMode] [dataPath] [outPath]"
        return

    core = int(sys.argv[2])
    dataset = sys.argv[3]
    evalMode = sys.argv[4]
    dataPath = sys.argv[5]
    outPath = sys.argv[6]
    print "Building Data"
    connMat, comList, proList = yam.loadComProConnMat(dataset,
                                                      dataPath + "/Adjacency")
    kernel = yam.loadKernel(dataset, dataPath)

    # work on integer indices rather than raw identifiers
    proListIdx = [i for i, _ in enumerate(proList)]
    comListIdx = [i for i, _ in enumerate(comList)]

    nComp = len(comList)
    nProtein = len(proList)

    # symmetrize the kernel by averaging k(i,j) and k(j,i)
    comSimMat = np.zeros((nComp, nComp), dtype=float)
    proSimMat = np.zeros((nProtein, nProtein), dtype=float)
    for row, i in enumerate(comList):
        for col, j in enumerate(comList):
            comSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2

    for row, i in enumerate(proList):
        for col, j in enumerate(proList):
            proSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2

    # wrap as {"precomputed": ...} dicts after kernel regularization
    # (an "rbf" entry is added later in the multi-core branch)
    comSimMat = {"precomputed": regularizationKernel(comSimMat)}
    proSimMat = {"precomputed": regularizationKernel(proSimMat)}

    pairData = []
    connList = []
    print "Split Dataset..."
    if evalMode == "loocv":
        # leave-one-out: one fold per compound and one per protein
        nFold = len(comListIdx)
        kSplit = KFold(n_splits=nFold, shuffle=True)
        comSplit = kSplit.split(comListIdx)

        nFold = len(proListIdx)
        kSplit = KFold(n_splits=nFold, shuffle=True)
        proSplit = kSplit.split(proListIdx)

    elif evalMode == "kfold":
        nFold = 10
        kSplit = KFold(n_splits=nFold, shuffle=True)
        comSplit = kSplit.split(comListIdx)
        proSplit = kSplit.split(proListIdx)

    else:
        assert (False)

    predictedData = np.zeros((len(comList), len(proList)), dtype=float)
    splitPred = []
    proTestList = []
    proTrainList = []
    comTestList = []
    comTrainList = []

    # materialize fold index lists from the split generators
    for trainIndex, testIndex in proSplit:
        proTestList.append([i for i in testIndex])
        proTrainList.append([i for i in trainIndex])
    for trainIndex, testIndex in comSplit:
        comTestList.append([i for i in testIndex])
        comTrainList.append([i for i in trainIndex])

    if core == 1:
        predRes, testData = singleProc(comTrainList, proTrainList, comTestList,
                                       proTestList)
    elif core > 1:
        # precompute the network-similarity "rbf" kernels once before
        # handing the folds to the parallel workers
        tempPred = BLMNII(classParam)
        tempPred.setAttr(connMat=connMat,
                         comSimMat=comSimMat,
                         proSimMat=proSimMat)

        comSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("com"))
        proSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("pro"))
        tempPred.setAttr(comSimMat=comSimMat, proSimMat=proSimMat)

        predRes, testData = parallelProc(core, comTrainList, proTrainList,
                                         comTestList, proTestList)
    else:
        print "Error: Invalid core processor number"
        return


#####################################################################
    print "\nCalculate Performance"
    key = 'BLM-NII'
    precision, recall, _ = precision_recall_curve(testData, predRes)
    prAUC = average_precision_score(testData, predRes, average='micro')

    print "Visualiation"
    lineType = 'k-.'

    perf = {
        'precision': precision,
        'recall': recall,
        'prAUC': prAUC,
        'lineType': lineType
    }
    # JSON-friendly summary (arrays omitted)
    perf2 = {'prAUC': prAUC, 'nTest': nComp * nProtein}

    with open(
            outPath + 'perf_blmnii_' + dataset + '_' + evalMode + '_' +
            str(classParam["alpha"]) + '_' + str(classParam["gamma"]) +
            '_perf.json', 'w') as fp:
        json.dump(perf2, fp, indent=2, sort_keys=True)

    plt.clf()
    plt.figure()
    plt.plot(perf['recall'],
             perf['precision'],
             perf['lineType'],
             label=key + ' (area = %0.2f)' % perf['prAUC'],
             lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.savefig(outPath + '/' + dataset + '_' + evalMode + '_' +
                str(classParam["alpha"]) + '_' + str(classParam["gamma"]) +
                '_pr_curve_.png',
                bbox_inches='tight')
Ejemplo n.º 10
0
def main():
    """Derive negative interaction labels from clustering results.

    For each (compound-metric, protein-metric) pair, counts positive
    interactions between every compound-cluster/protein-cluster pair;
    entries whose cluster pair has zero positives are relabeled -1 in a
    copy of connMat, and the resulting label dicts are written to pickle
    and JSON files under targetDir.
    """
    if len(sys.argv)!=2:
        print 'USAGE:'
        print 'python cluster2.py [targetDir]'
        print '/param targetDir: dir containing one compound and one protein clustering results'
        return

    tdir = sys.argv[1]
    metrics = ['calinskiharabaz','silhouette']
    modes = ['compound','protein']

    ##
    print 'loading connMat...'
    # the target dir name encodes the dataset as ...-<dataset>#<subset>
    dataset = tdir.split('/')[-1].split('-')[-1]
    dParam = dataset.split('#')
    dpath = os.path.join(DATASET_DIR,dParam[0],'ground-truth')
    connMat,comList,proList = yam.loadComProConnMat(dParam[1],dpath)

    ##
    print 'loading compound and protein clusters...'
    def _loadCluster(mode,metric):
        # pick the single result dir whose name contains this mode, then
        # read item->label and invert it into label->items
        dname = [i for i in os.listdir(tdir) if (mode in i)][0]
        dpath = os.path.join(tdir,dname); assert os.path.isdir(dpath)
        item2clusterlabel = dict(); clusterlabel2item = defaultdict(list)
        fname = '_'.join([mode,metric,'bestlabels.json'])
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; prefer yaml.safe_load if these files are not trusted.
        with open(os.path.join(dpath,fname),'r') as f: item2clusterlabel = yaml.load(f)
        for k,v in item2clusterlabel.iteritems(): clusterlabel2item[v].append(k)
        return {'item2clusterlabel':item2clusterlabel,'clusterlabel2item':clusterlabel2item}

    clusterData = {}
    for mode in modes:
        for metric in metrics:
            clusterData[(mode,metric)] = _loadCluster(mode,metric)

    ##
    def _getNumberOfConnBetweenComProClusters(comMetric,proMetric):
        # count positive interactions for every (com cluster, pro cluster)
        # pair; label -1 is skipped (outlier cluster)
        nConnDict = {}
        for comCluster in clusterData[('compound',comMetric)]['clusterlabel2item']:
            if (comCluster==-1): continue
            for proCluster in clusterData[('protein',proMetric)]['clusterlabel2item']:
                if (proCluster==-1): continue
                nConn = 0
                for com in clusterData[('compound',comMetric)]['clusterlabel2item'][comCluster]:
                    for pro in clusterData[('protein',proMetric)]['clusterlabel2item'][proCluster]:
                        conn = int( connMat[comList.index(com)][proList.index(pro)] )
                        if conn==1: nConn += 1
                nConnDict[(comCluster,proCluster)] = nConn
        return nConnDict

    connAmongComProClusters = dict()
    for comMet in metrics:
        for proMet in metrics:
            print 'get clusterConn of '+comMet+' and '+proMet
            connAmongComProClusters[(comMet,proMet)] =  _getNumberOfConnBetweenComProClusters(comMet,proMet)

    ##
    for comMet in metrics:
        for proMet in metrics:
            print 'getting connMat2 of '+comMet+' and '+proMet
            # relabel unknown entries as negative (-1) when the owning
            # cluster pair has no positive interaction at all
            connMat2 = np.copy(connMat)
            for i in range(connMat.shape[0]):
                for j in range(connMat.shape[1]):
                    if connMat[i][j]==1: continue # because of known positive interaction
                    comCluster = clusterData[('compound',comMet)]['item2clusterlabel'][ comList[i] ]
                    proCluster = clusterData[('protein',proMet)]['item2clusterlabel'][ proList[j] ]
                    if (comCluster==-1)or(proCluster==-1): continue # because of outlier cluster label
                    if connAmongComProClusters[(comMet,proMet)][(comCluster,proCluster)]==0:
                        connMat2[i][j] = -1

            # connDict: label -> list of (com, pro) pairs
            # connDict2: label -> [count, fraction of all pairs]
            connDict = defaultdict(list); connDict2 = defaultdict(list)
            connDictRaw = util.connMat2Dict(connMat2,comList,proList)
            for k,v in connDictRaw.iteritems(): connDict[int(v)].append(k)
            for k,v in connDict.iteritems(): connDict2[k].append(len(v))
            summ = sum([v[0] for v in connDict2.values()])
            for k,v in connDict2.iteritems(): connDict2[k].append(float(v[0])/summ)

            ##
            tag = '_'.join([comMet,proMet])
            # np.savetxt(os.path.join(tdir,tag+'_connMat.csv'),connMat2,delimiter=',')
            # with open(os.path.join(tdir,tag+"labels.json"),'w') as f:
            #     json.dump(connDict,f,indent=2,sort_keys=True)
            with open(os.path.join(tdir,tag+"_labels.pkl"),'w') as f:
                pickle.dump(connDict,f)
            with open(os.path.join(tdir,tag+"_labels_stat.json"),'w') as f:
                json.dump(connDict2,f,indent=2,sort_keys=True)
Ejemplo n.º 11
0
def main():
    """SELFBLM experiment: cluster-based negatives plus CV evaluation.

    Loads the Yamanishi adjacency and kernel for a dataset, builds
    symmetrized/regularized similarity matrices, clusters compounds and
    proteins with k-medoids to generate negative labels, runs SELFBLM
    prediction over LOOCV or 10-fold splits, then writes AUPR stats and
    a precision-recall curve to outPath.
    """
    if len(sys.argv)!=5:
        print ("python blmniisvm_experiment.py [DataSetCode] [evalMode]"
                " [dataPath] [outPath]")
        return
    classParam = dict(name='blmnii',proba=True)

    dataset = sys.argv[1]
    evalMode = sys.argv[2]
    dataPath = sys.argv[3]
    outPath = sys.argv[4]

    print "Building Data"
    connMat,comList,proList = yam.loadComProConnMat(dataset,dataPath+"/Adjacency")
    kernel = yam.loadKernel(dataset,dataPath)

    # work on integer indices rather than raw identifiers
    comListIdx = [i for i,_ in enumerate(comList)]
    proListIdx = [i for i,_ in enumerate(proList)]

    nComp = len(comList)
    nProtein = len(proList)

    # symmetrize the kernel by averaging k(i,j) and k(j,i)
    comSimMat = np.zeros((nComp,nComp), dtype=float)
    proSimMat = np.zeros((nProtein,nProtein), dtype=float)
    for row,i in enumerate(comList):
        for col,j in enumerate(comList):
            comSimMat[row][col] = (kernel[(i,j)]+kernel[(j,i)])/2

    for row,i in enumerate(proList):
        for col,j in enumerate(proList):
            proSimMat[row][col] = (kernel[(i,j)]+kernel[(j,i)])/2

    comSimMat = regularizationKernel(comSimMat)
    proSimMat = regularizationKernel(proSimMat)

    print "Clustering"
    # k-medoids needs distances, so convert similarity -> distance first
    comDisMat = kmedoid.simToDis(comSimMat)
    proDisMat = kmedoid.simToDis(proSimMat)

    # k = half the number of items (Python 2 integer division)
    _,proClust = kmedoid.kMedoids(len(proList)/2, proDisMat)
    _,comClust = kmedoid.kMedoids(len(comList)/2, comDisMat)

    print "Generate Negative Data"
    connMat = genNegativeData(connMat,proClust,comClust)
    # PLACEHOLDER

    # Split Data
    pairData = []
    connList = []
    print "Split Dataset..."
    if evalMode == "loocv":
        # leave-one-out: one fold per compound and one per protein
        nFold = len(comListIdx)
        kSplit = KFold(n_splits=nFold,shuffle=True)
        comSplit = kSplit.split(comListIdx)

        nFold = len(proListIdx)
        kSplit = KFold(n_splits=nFold,shuffle=True)
        proSplit = kSplit.split(proListIdx)

    elif evalMode == "kfold":
        nFold = 10
        kSplit = KFold(n_splits=nFold, shuffle=True)
        comSplit = kSplit.split(comListIdx)
        proSplit = kSplit.split(proListIdx)

    else:
        assert(False)

    predictedData = np.zeros((len(comList),len(proList)),dtype=float)
    splitPred = []
    proTestList = []
    proTrainList = []
    comTestList = []
    comTrainList = []

    # materialize fold index lists from the split generators
    for trainIndex, testIndex in proSplit:
        proTestList.append([i for i in testIndex])
        proTrainList.append([i for i in trainIndex])
    for trainIndex, testIndex in comSplit:
        comTestList.append([i for i in testIndex])
        comTrainList.append([i for i in trainIndex])

    predRes = []
    testData = []

    print "Predicting..."
    # evaluate every (compound fold, protein fold) combination; predRes
    # collects predictor scores, testData the +/-1 ground-truth labels
    for ii,i in enumerate(comTestList):
        for jj,j in enumerate(proTestList):
            sys.stdout.write("\r%03d of %03d||%03d of %03d" %
                                (jj+1, len(proTestList), ii+1,len(comTestList),))
            sys.stdout.flush()

            predictor = SELFBLM(classParam, connMat, comSimMat, proSimMat,
                            [comTrainList[ii],proTrainList[jj]],[i,j])
            for comp in i:
                for prot in j:
                    predRes.append(predictor.predict([(comp,prot)]))
                    if connMat[comp][prot] == 1:
                        testData.append(1)
                    else:
                        testData.append(-1)

    # run core selfBLM
    # Evaluate prediction
    print "\nCalculate Performance"
    key = 'PredictionUsingSelfBLM'
    precision, recall, _ = precision_recall_curve(testData, predRes)
    prAUC = average_precision_score(testData, predRes, average='micro')

    print "Visualiation"
    lineType = 'k-.'

    perf = {'precision': precision, 'recall': recall, 'prAUC': prAUC,
                 'lineType': lineType}
    # JSON-friendly summary (arrays omitted)
    perf2 = {'prAUC': prAUC, 'nTest': nComp*nProtein}

    with open(outPath+'perf_selfblm_'+evalMode+'_'+dataset+'_perf.json', 'w') as fp:
        json.dump(perf2, fp, indent=2, sort_keys=True)

    plt.clf()
    plt.figure()
    plt.plot(perf['recall'], perf['precision'], perf['lineType'], label= key+' (area = %0.2f)' % perf['prAUC'], lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.savefig(outPath+'/pr_curve_'+ dataset +'_'+evalMode+'_selfblm.png', bbox_inches='tight')
Ejemplo n.º 12
0
def main():
    if len(sys.argv)!=4:
        print "Usage: python kmedoid.py [e|ic|gpcr|nr] [dataDir] [outputDir]"
        return

    dataPath = sys.argv[1]
    dataset = sys.argv[2]
    outPath = sys.argv[3]

    # Load file
    print "Preparing data"
    _,comList,proList = yam.loadComProConnMat(dataset,dataPath+"/Adjacency")
    kernel = yam.loadKernel(dataset,dataPath)

    nComp = len(comList)
    nProtein = len(proList)

    comSimMat = np.zeros((nComp,nComp), dtype=float)
    proSimMat = np.zeros((nProtein,nProtein), dtype=float)

    for row,i in enumerate(comList):
        for col,j in enumerate(comList):
            comSimMat[row][col] = kernel[(i,j)]

    for row,i in enumerate(proList):
        for col,j in enumerate(proList):
            proSimMat[row][col] = kernel[(i,j)]

    # convert similarity matrix to distance Matrix
    proDisMat = simToDis(proSimMat)
    comDisMat = simToDis(comSimMat)

    print "Clustering"
    proMedoid,proClust = kMedoids(len(proList)/2, proDisMat)
    comMedoid,comClust = kMedoids(len(comList)/2, comDisMat)
    # Take each label for each sample
    comLabelList = np.zeros((nComp))
    proLabelList = np.zeros((nProtein))
    proMetaClust = dict()
    comMetaClust = dict()

    for lab in proClust:
        meta = []
        for idx in proClust[lab]:
            meta.append(proList[idx])
            proLabelList[idx] = lab
        proMetaClust[lab] = meta


    for lab in comClust:
        meta = []
        for idx in comClust[lab]:
            meta.append(comList[idx])
            comLabelList[idx] = lab
        comMetaClust[lab] = meta

    print "Evaluation"

    comSilhouette = met.silhouette_score(comDisMat,comLabelList,metric="precomputed")
    proSilhouette = met.silhouette_score(proDisMat,proLabelList,metric="precomputed")

    comCalinskiHarabaz = met.calinski_harabaz_score(comDisMat,comLabelList)
    proCalinskiHarabaz = met.calinski_harabaz_score(proDisMat,proLabelList)

    print ("Silhouette score :\nCompound cluster = "+str(comSilhouette)+
            ",Protein cluster = "+str(proSilhouette))

    print ("Calinski Harabaz score :\nCompound cluster = "+str(comCalinskiHarabaz)+
            ", Protein cluster = "+str(proCalinskiHarabaz))

    print "Writing Output"

    perf = {'silhouette_score_':{'compound':comSilhouette,'protein':proSilhouette},
            'calinski_harabaz_score':{'compound':comCalinskiHarabaz,'protein':
            proCalinskiHarabaz}}

    with open(outPath+"/perf_medoid_"+dataset+".json",'w') as f:
        json.dump(perf,f, indent=2, sort_keys=True)

    with open(outPath+"/cluster_medoid_com_"+dataset+".json",'w') as f:
        json.dump(comMetaClust,f, indent=2, sort_keys=True)

    with open(outPath+"/cluster_medoid_pro_"+dataset+".json",'w') as f:
        json.dump(proMetaClust,f, indent=2, sort_keys=True)