def main():
    """Generate cluster-based negative labels and dump the adjacency matrix.

    Expects five positional arguments in ``sys.argv``: dataset code,
    clustering method, data directory, cluster directory and output
    directory.  With any other argument count the usage line is printed and
    the function returns without doing any work.

    The matrix returned by ``genNegativeData`` is transposed and written to
    ``<outPath>/admat_dgc_<dataset>_negative.txt``: one header line holding
    the ``comList`` entries, then one line per ``proList`` entry.  Finally
    the number of cells equal to 1, 0 and -1 is printed.
    """
    cliArgs = sys.argv
    if len(cliArgs) != 6:
        print("python blmniisvm_experiment.py [e|ic|gpcr|nr] [clustMethod] "
              "[dataPath] [clusterPath] [outPath]")
        return
    dataset, method, dataPath, clusterPath, outPath = cliArgs[1:6]

    print("Loading Adjacency")
    adjacency, comList, proList = yam.loadComProConnMat(
        dataset, dataPath + "/Adjacency")
    total = len(comList) * len(proList)

    print("Loading Cluster")
    clusterPrefix = clusterPath + "/cluster_" + method
    comClust = loadCluster(clusterPrefix + "_com_" + dataset + ".json",
                           comList)
    proClust = loadCluster(clusterPrefix + "_pro_" + dataset + ".json",
                           proList)

    print("Generate Negative Data")
    labelled = genNegativeData(adjacency, proClust, comClust)

    print("Writing Output To " + outPath)
    # Transpose so that each output row is indexed by proList and each
    # column by comList.
    byProtein = []
    for col in range(len(labelled[0])):
        byProtein.append([row[col] for row in labelled])

    with open(outPath + "/admat_dgc_" + dataset + "_negative.txt", 'w') as f:
        f.write(" ".join(str(c) for c in comList))
        f.write("\n")
        for idx, cells in enumerate(byProtein):
            f.write(proList[idx].ljust(7))
            f.write("".join(" " + str(cell) for cell in cells))
            f.write("\n")

    print("Stats: ")
    flat = [value for cells in byProtein for value in cells]
    unlabeled = sum(1 for value in flat if value == 0)
    negative = sum(1 for value in flat if value == -1)
    positive = sum(1 for value in flat if value == 1)
    print("Total Data: " + str(total))
    print("Positive Data: " + str(positive))
    print("Unlabeled Data: " + str(unlabeled))
    print("Negative Data: " + str(negative))
def main(argv):
    """Set up cross-validation folds and dispatch KronRLS evaluation.

    Args:
        argv: command-line style list ``[prog, dataMode, valMode]``.
            ``valMode`` must be ``'loocv'`` or ``'kfcv'``.

    Returns:
        None.  When ``argv`` does not have exactly three entries the usage
        line is printed and nothing else happens.

    Raises:
        AssertionError: if ``valMode`` is not a known mode.  This is raised
            explicitly (so it survives ``python -O``) and before any data is
            loaded.
    """
    if len(argv) != 3:
        print('USAGE: python devel.py [dataMode] [valMode]')
        return
    dataMode = argv[1]
    valMode = argv[2]

    # Validate up front: a bare ``assert`` is stripped under -O and would
    # otherwise only fire after the dataset and kernel had been loaded.
    if valMode not in ('loocv', 'kfcv'):
        raise AssertionError('Unknown valMode: ' + str(valMode))

    # load development dataset, containing com-pro connectivity
    connMat, comList, proList = yam.loadComProConnMat(dataMode)
    kernel = yam.loadKernel(dataMode)

    # One sample per (compound, protein) pair; the label is the matching
    # connMat cell.
    dataX = []
    dataY = []
    for i, com in enumerate(comList):
        for j, pro in enumerate(proList):
            dataX.append((com, pro))
            dataY.append(connMat[i][j])
    nData = len(dataY)

    if valMode == 'loocv':
        nFolds = nData
        kfList = KFold(nData, n_folds=nFolds, shuffle=True)
    else:  # 'kfcv'
        nFolds = 10
        kfList = StratifiedKFold(dataY, n_folds=nFolds, shuffle=True)

    kronrls = KronRLS(connMat, comList, proList, kernel)

    # prep for parallel: only the test side of each fold is used here
    xTestList = []
    yTestList = []
    for _, testIdxList in kfList:
        xTestList.append([dataX[i] for i in testIdxList])
        yTestList.append([dataY[i] for i in testIdxList])

    # NOTE(review): the result is neither returned nor consumed in this
    # function -- confirm whether fu.map evaluates eagerly.
    yPredList = fu.map(evalPerFold, xTestList, yTestList,
                       [kronrls] * nFolds, [connMat] * nFolds,
                       [comList] * nFolds, [proList] * nFolds,
                       [kernel] * nFolds)
def main(argv):
    """Set up cross-validation folds and dispatch KronRLS evaluation.

    Args:
        argv: command-line style list ``[prog, dataMode, valMode]``.
            ``valMode`` must be ``'loocv'`` or ``'kfcv'``.

    Returns:
        None.  When ``argv`` does not have exactly three entries the usage
        line is printed and nothing else happens.

    Raises:
        AssertionError: if ``valMode`` is not a known mode.  This is raised
            explicitly (so it survives ``python -O``) and before any data is
            loaded.
    """
    if len(argv) != 3:
        print('USAGE: python devel.py [dataMode] [valMode]')
        return
    dataMode, valMode = argv[1], argv[2]

    # Reject an unknown mode before the expensive loads; a bare ``assert``
    # would be stripped under -O.
    if valMode not in ('loocv', 'kfcv'):
        raise AssertionError('Unknown valMode: ' + str(valMode))

    # load development dataset, containing com-pro connectivity
    connMat, comList, proList = yam.loadComProConnMat(dataMode)
    kernel = yam.loadKernel(dataMode)

    # Flatten the connectivity matrix into (compound, protein) samples and
    # their labels.
    dataX = []
    dataY = []
    for i, com in enumerate(comList):
        for j, pro in enumerate(proList):
            dataX.append((com, pro))
            dataY.append(connMat[i][j])
    nData = len(dataY)

    if valMode == 'loocv':
        nFolds = nData
        kfList = KFold(nData, n_folds=nFolds, shuffle=True)
    else:  # 'kfcv'
        nFolds = 10
        kfList = StratifiedKFold(dataY, n_folds=nFolds, shuffle=True)

    kronrls = KronRLS(connMat, comList, proList, kernel)

    # prep for parallel: the training indices of each fold are not needed
    xTestList = []
    yTestList = []
    for _, testIdxList in kfList:
        xTestList.append([dataX[i] for i in testIdxList])
        yTestList.append([dataY[i] for i in testIdxList])

    # NOTE(review): the mapped result is not used further in this function.
    yPredList = fu.map(evalPerFold, xTestList, yTestList,
                       [kronrls] * nFolds, [connMat] * nFolds,
                       [comList] * nFolds, [proList] * nFolds,
                       [kernel] * nFolds)
def main(argv):
    """Cross-validate a KronRLS predictor and save its precision-recall curve.

    Args:
        argv: command-line style list ``[prog, dataMode, valMode]`` where
            ``valMode`` is ``'loocv'`` or ``'kfcv'``.

    Returns:
        None.  With a wrong argument count the usage text is printed and the
        function returns immediately.

    Raises:
        AssertionError: if ``valMode`` is unknown.  Raised explicitly and
            before any data is loaded, so it is not lost under ``python -O``.

    The figure is saved to ``outDir + 'pr_curve_<dataMode>_<valMode>.png'``;
    ``outDir`` and ``cfg`` are read from the enclosing module.
    """
    if len(argv) != 3:
        print('USAGE: ')
        print('python devel.py [dataMode:e/nr/gpcr/ic] [valMode:loocv/kfcv]')
        return
    dataMode = argv[1]
    valMode = argv[2]

    # Fail fast instead of asserting after the dataset, kernel and predictor
    # have already been built.
    if valMode not in ('loocv', 'kfcv'):
        raise AssertionError('Unknown valMode')

    # load development dataset, containing com-pro connectivity
    connMatDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/ground-truth'
    connMat, comList, proList = yam.loadComProConnMat(dataMode, connMatDpath)
    kernelDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/similarity-mat'
    kernel = yam.loadKernel(dataMode, kernelDpath)

    # One sample per (compound, protein) pair, labelled by connMat.
    dataX = []
    dataY = []
    for i, com in enumerate(comList):
        for j, pro in enumerate(proList):
            dataX.append((com, pro))
            dataY.append(connMat[i][j])
    nData = len(dataY)
    print('nData= ' + str(nData))

    # instantiate a KronRLS predictor
    kronrls = KronRLS(cfg, connMat, comList, proList, kernel)

    if valMode == 'loocv':
        nFolds = nData
        kfList = KFold(n_splits=nFolds).split(dataX)
    else:  # 'kfcv'
        nFolds = 10
        kfList = StratifiedKFold(n_splits=nFolds).split(dataX, dataY)

    yTestList = []
    yPredList = []
    for fold, (_, testIdxList) in enumerate(kfList, 1):
        print(' '.join(['fold=', str(fold), 'of', str(nFolds),
                        '######################################']))
        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]
        # Only prediction happens per fold; the training indices are unused.
        yPred = kronrls.predict(xTest)
        yTestList += yTest
        yPredList += yPred

    print('calculating aupr...')
    precision, recall, _ = precision_recall_curve(yTestList, yPredList)
    aupr = average_precision_score(yTestList, yPredList, average='micro')

    print('plotting ...')
    plt.clf()
    plt.figure()
    plt.plot(recall, precision, 'r-', label='(area = %0.2f)' % aupr, lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve of ' + dataMode + ' ' + valMode)
    plt.legend(loc="lower left")
    fname = 'pr_curve_' + dataMode + '_' + valMode + '.png'
    plt.savefig(outDir + fname, bbox_inches='tight')
def auprTest():
    """Run the BLM-NII AUPR experiment described by ``sys.argv``.

    Expected arguments (``sys.argv[2:7]``): number of cores, dataset code,
    evaluation mode (``"loocv"`` or ``"kfold"``), data directory and output
    directory.  With a wrong argument count the usage line is printed and the
    function returns.

    Side effects:
        * rebinds the module globals ``classParam``, ``comSimMat``,
          ``proSimMat`` and ``connMat``;
        * ``classParam`` is the same object as ``blmniiConfig``, so the
          overrides set here are written into that dict;
        * writes ``perf_blmnii_<tag>_perf.json`` and ``<tag>_pr_curve_.png``
          into ``outPath``.

    Raises:
        AssertionError: if the evaluation mode is unknown (raised explicitly
            and before any data is loaded).
    """
    global classParam, comSimMat, proSimMat, connMat, comNetSim, proNetSim
    classParam = blmniiConfig
    classParam["proba"] = False
    classParam["data"] = "precomputed"
    classParam["kernel"] = ["precomputed", "rbf"]
    classParam["retValue"] = "score"

    if len(sys.argv) != 7:
        print("Usage: python devel.py aupr [numCore] [DataSetCode] [evalMode] [dataPath] [outPath]")
        return
    core = int(sys.argv[2])
    dataset = sys.argv[3]
    evalMode = sys.argv[4]
    dataPath = sys.argv[5]
    outPath = sys.argv[6]

    # Validate the cheap arguments before the expensive kernel work; a bare
    # ``assert`` would also be stripped under -O.
    if evalMode not in ("loocv", "kfold"):
        raise AssertionError("Unknown evalMode: " + evalMode)
    if core < 1:
        print("Error: Invalid core processor number")
        return

    print("Building Data")
    connMat, comList, proList = yam.loadComProConnMat(dataset,
                                                      dataPath + "/Adjacency")
    kernel = yam.loadKernel(dataset, dataPath)
    nComp = len(comList)
    nProtein = len(proList)
    comListIdx = list(range(nComp))
    proListIdx = list(range(nProtein))

    # Symmetrise the similarity values by averaging both orientations.
    comSimMat = np.zeros((nComp, nComp), dtype=float)
    proSimMat = np.zeros((nProtein, nProtein), dtype=float)
    for row, i in enumerate(comList):
        for col, j in enumerate(comList):
            comSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2
    for row, i in enumerate(proList):
        for col, j in enumerate(proList):
            proSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2

    comSimMat = {"precomputed": regularizationKernel(comSimMat)}
    proSimMat = {"precomputed": regularizationKernel(proSimMat)}

    print("Split Dataset...")
    if evalMode == "loocv":
        comSplit = KFold(n_splits=nComp, shuffle=True).split(comListIdx)
        proSplit = KFold(n_splits=nProtein, shuffle=True).split(proListIdx)
    else:  # "kfold"
        kSplit = KFold(n_splits=10, shuffle=True)
        comSplit = kSplit.split(comListIdx)
        proSplit = kSplit.split(proListIdx)

    # The protein split is consumed before the compound split, as before.
    proTestList = []
    proTrainList = []
    comTestList = []
    comTrainList = []
    for trainIndex, testIndex in proSplit:
        proTestList.append(list(testIndex))
        proTrainList.append(list(trainIndex))
    for trainIndex, testIndex in comSplit:
        comTestList.append(list(testIndex))
        comTrainList.append(list(trainIndex))

    if core == 1:
        predRes, testData = singleProc(comTrainList, proTrainList,
                                       comTestList, proTestList)
    else:
        # Pre-compute the "rbf" similarity on the full data before fanning
        # out to the worker processes.
        tempPred = BLMNII(classParam)
        tempPred.setAttr(connMat=connMat, comSimMat=comSimMat,
                         proSimMat=proSimMat)
        comSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("com"))
        proSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("pro"))
        tempPred.setAttr(comSimMat=comSimMat, proSimMat=proSimMat)
        predRes, testData = parallelProc(core, comTrainList, proTrainList,
                                         comTestList, proTestList)

    #####################################################################
    print("\nCalculate Performance")
    key = 'BLM-NII'
    precision, recall, _ = precision_recall_curve(testData, predRes)
    prAUC = average_precision_score(testData, predRes, average='micro')

    print("Visualiation")
    lineType = 'k-.'
    perf2 = {'prAUC': prAUC, 'nTest': nComp * nProtein}
    tag = (dataset + '_' + evalMode + '_' + str(classParam["alpha"]) + '_'
           + str(classParam["gamma"]))

    # The report previously lacked the '/' separator that the PNG path below
    # uses, so it landed next to outPath instead of inside it.
    with open(outPath + '/' + 'perf_blmnii_' + tag + '_perf.json', 'w') as fp:
        json.dump(perf2, fp, indent=2, sort_keys=True)

    plt.clf()
    plt.figure()
    plt.plot(recall, precision, lineType,
             label=key + ' (area = %0.2f)' % prAUC, lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.savefig(outPath + '/' + tag + '_pr_curve_.png', bbox_inches='tight')
def main():
    """Derive cluster-based negative labels for compound-protein pairs.

    Takes one argument, ``targetDir``, from ``sys.argv``; with any other
    argument count the usage text is printed and the function returns.

    For every (compound metric, protein metric) combination:
      1. count known interactions (``connMat == 1``) between each pair of
         non-outlier compound/protein clusters;
      2. copy ``connMat`` and set to -1 every non-positive cell whose
         compound and protein clusters share no known interaction (cells
         with cluster label -1 are left untouched);
      3. write ``<tag>_labels.pkl`` (label -> list of keys from
         ``util.connMat2Dict``) and ``<tag>_labels_stat.json``
         (label -> [count, fraction]) into ``targetDir``.
    """
    if len(sys.argv) != 2:
        print('USAGE:')
        print('python cluster2.py [targetDir]')
        print('/param targetDir: dir containing one compound and one protein clustering results')
        return

    tdir = sys.argv[1]
    metrics = ['calinskiharabaz', 'silhouette']
    modes = ['compound', 'protein']

    print('loading connMat...')
    # rstrip: a trailing '/' on targetDir would otherwise give an empty name.
    dataset = tdir.rstrip('/').split('/')[-1].split('-')[-1]
    dParam = dataset.split('#')
    dpath = os.path.join(DATASET_DIR, dParam[0], 'ground-truth')
    connMat, comList, proList = yam.loadComProConnMat(dParam[1], dpath)

    # Position lookup tables replace repeated list.index() calls in the
    # nested loops below; setdefault keeps the first occurrence, matching
    # list.index().  An unknown item now raises KeyError, not ValueError.
    comIndex = {}
    for idx, com in enumerate(comList):
        comIndex.setdefault(com, idx)
    proIndex = {}
    for idx, pro in enumerate(proList):
        proIndex.setdefault(pro, idx)

    print('loading compound and protein clusters...')

    def _loadCluster(mode, metric):
        # Load one label file and build both lookup directions.
        dname = [i for i in os.listdir(tdir) if (mode in i)][0]
        clusterDir = os.path.join(tdir, dname)
        assert os.path.isdir(clusterDir)

        fname = '_'.join([mode, metric, 'bestlabels.json'])
        with open(os.path.join(clusterDir, fname), 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary objects from untrusted input; consider
            # yaml.safe_load if these files are not fully trusted.
            item2clusterlabel = yaml.load(f)

        clusterlabel2item = defaultdict(list)
        for k, v in item2clusterlabel.items():
            clusterlabel2item[v].append(k)
        return {
            'item2clusterlabel': item2clusterlabel,
            'clusterlabel2item': clusterlabel2item
        }

    clusterData = {}
    for mode in modes:
        for metric in metrics:
            clusterData[(mode, metric)] = _loadCluster(mode, metric)

    def _getNumberOfConnBetweenComProClusters(comMetric, proMetric):
        # Count known interactions between each pair of non-outlier clusters.
        comClusters = clusterData[('compound', comMetric)]['clusterlabel2item']
        proClusters = clusterData[('protein', proMetric)]['clusterlabel2item']
        nConnDict = {}
        for comCluster in comClusters:
            if comCluster == -1:
                continue
            for proCluster in proClusters:
                if proCluster == -1:
                    continue
                nConn = 0
                for com in comClusters[comCluster]:
                    for pro in proClusters[proCluster]:
                        conn = int(connMat[comIndex[com]][proIndex[pro]])
                        if conn == 1:
                            nConn += 1
                nConnDict[(comCluster, proCluster)] = nConn
        return nConnDict

    connAmongComProClusters = dict()
    for comMet in metrics:
        for proMet in metrics:
            print('get clusterConn of ' + comMet + ' and ' + proMet)
            connAmongComProClusters[(comMet, proMet)] = \
                _getNumberOfConnBetweenComProClusters(comMet, proMet)

    for comMet in metrics:
        for proMet in metrics:
            print('getting connMat2 of ' + comMet + ' and ' + proMet)
            comLabels = clusterData[('compound', comMet)]['item2clusterlabel']
            proLabels = clusterData[('protein', proMet)]['item2clusterlabel']
            clusterConn = connAmongComProClusters[(comMet, proMet)]

            connMat2 = np.copy(connMat)
            for i in range(connMat.shape[0]):
                for j in range(connMat.shape[1]):
                    if connMat[i][j] == 1:
                        continue  # because of known positive interaction
                    comCluster = comLabels[comList[i]]
                    proCluster = proLabels[proList[j]]
                    if (comCluster == -1) or (proCluster == -1):
                        continue  # because of outlier cluster label
                    if clusterConn[(comCluster, proCluster)] == 0:
                        connMat2[i][j] = -1

            # Group the pair keys by label, then derive count and fraction.
            connDict = defaultdict(list)
            connDictRaw = util.connMat2Dict(connMat2, comList, proList)
            for k, v in connDictRaw.items():
                connDict[int(v)].append(k)

            connDict2 = defaultdict(list)
            for k, v in connDict.items():
                connDict2[k].append(len(v))
            summ = sum(v[0] for v in connDict2.values())
            for v in connDict2.values():
                v.append(float(v[0]) / summ)

            tag = '_'.join([comMet, proMet])
            # Pickle output must go to a binary-mode file.
            with open(os.path.join(tdir, tag + "_labels.pkl"), 'wb') as f:
                pickle.dump(connDict, f)
            with open(os.path.join(tdir, tag + "_labels_stat.json"), 'w') as f:
                json.dump(connDict2, f, indent=2, sort_keys=True)
def main(argv):
    """Cross-validate a KronRLS predictor and save its precision-recall curve.

    Args:
        argv: command-line style list ``[prog, dataMode, valMode]`` where
            ``valMode`` is ``'loocv'`` or ``'kfcv'``.

    Returns:
        None.  With a wrong argument count the usage text is printed and the
        function returns immediately.

    Raises:
        AssertionError: if ``valMode`` is unknown.  Raised explicitly and
            before any data is loaded, so it is not lost under ``python -O``.

    The figure is saved to ``outDir + 'pr_curve_<dataMode>_<valMode>.png'``;
    ``outDir`` and ``cfg`` are read from the enclosing module.
    """
    if len(argv) != 3:
        print('USAGE: ')
        print('python devel.py [dataMode:e/nr/gpcr/ic] [valMode:loocv/kfcv]')
        return
    dataMode, valMode = argv[1], argv[2]

    # Reject an unknown mode before loading anything.
    if valMode not in ('loocv', 'kfcv'):
        raise AssertionError('Unknown valMode')

    # load development dataset, containing com-pro connectivity
    connMatDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/ground-truth'
    connMat, comList, proList = yam.loadComProConnMat(dataMode, connMatDpath)
    kernelDpath = '../../dataset/connectivity/compound_vs_protein/yamanishi/similarity-mat'
    kernel = yam.loadKernel(dataMode, kernelDpath)

    # Flatten connMat into (compound, protein) samples with their labels.
    dataX = []
    dataY = []
    for i, com in enumerate(comList):
        for j, pro in enumerate(proList):
            dataX.append((com, pro))
            dataY.append(connMat[i][j])
    nData = len(dataY)
    print('nData= ' + str(nData))

    # instantiate a KronRLS predictor
    kronrls = KronRLS(cfg, connMat, comList, proList, kernel)

    if valMode == 'loocv':
        nFolds = nData
        kfList = KFold(n_splits=nFolds).split(dataX)
    else:  # 'kfcv'
        nFolds = 10
        kfList = StratifiedKFold(n_splits=nFolds).split(dataX, dataY)

    yTestList = []
    yPredList = []
    fold = 0
    for _, testIdxList in kfList:
        fold += 1
        print(' '.join(['fold=', str(fold), 'of', str(nFolds),
                        '######################################']))
        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]
        # The training half of the fold is not used by this loop.
        yPred = kronrls.predict(xTest)
        yTestList += yTest
        yPredList += yPred

    print('calculating aupr...')
    precision, recall, _ = precision_recall_curve(yTestList, yPredList)
    aupr = average_precision_score(yTestList, yPredList, average='micro')

    print('plotting ...')
    plt.clf()
    plt.figure()
    plt.plot(recall, precision, 'r-', label='(area = %0.2f)' % aupr, lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve of ' + dataMode + ' ' + valMode)
    plt.legend(loc="lower left")
    fname = 'pr_curve_' + dataMode + '_' + valMode + '.png'
    plt.savefig(outDir + fname, bbox_inches='tight')
def main():
    """Write the negative-labelled adjacency matrix for one dataset.

    Command line (via ``sys.argv``): dataset code, clustering method, data
    directory, cluster directory, output directory.  Any other argument
    count prints the usage line and returns.

    After ``genNegativeData`` has relabelled the connectivity matrix, the
    matrix is transposed and saved as
    ``<outPath>/admat_dgc_<dataset>_negative.txt`` (header of ``comList``
    entries, then one left-justified ``proList`` name per row), and the
    counts of 1 / 0 / -1 cells are printed.
    """
    if len(sys.argv) != 6:
        print("python blmniisvm_experiment.py [e|ic|gpcr|nr] [clustMethod] "
              "[dataPath] [clusterPath] [outPath]")
        return
    dataset = sys.argv[1]
    method = sys.argv[2]
    dataPath = sys.argv[3]
    clusterPath = sys.argv[4]
    outPath = sys.argv[5]

    def _clusterFile(kind):
        # Path of the cluster JSON for "com" or "pro".
        return (clusterPath + "/cluster_" + method + "_" + kind + "_"
                + dataset + ".json")

    print("Loading Adjacency")
    connMat, comList, proList = yam.loadComProConnMat(
        dataset, dataPath + "/Adjacency")
    nComp = len(comList)
    nProtein = len(proList)

    print("Loading Cluster")
    comClust = loadCluster(_clusterFile("com"), comList)
    proClust = loadCluster(_clusterFile("pro"), proList)

    print("Generate Negative Data")
    connMat = genNegativeData(connMat, proClust, comClust)

    print("Writing Output To " + outPath)
    # Swap axes: afterwards each row is indexed by proList.
    width = len(connMat[0])
    connMat = [[source[col] for source in connMat] for col in range(width)]

    target = outPath + "/admat_dgc_" + dataset + "_negative.txt"
    with open(target, 'w') as f:
        f.write(" ".join([str(com) for com in comList]) + "\n")
        for rowNo, cells in enumerate(connMat):
            line = proList[rowNo].ljust(7)
            for cell in cells:
                line += " " + str(cell)
            f.write(line + "\n")

    print("Stats: ")
    # Each cell is counted under the first label it equals, in the same
    # order the labels were tested before (0, then -1, then 1).
    tally = {0: 0, -1: 0, 1: 0}
    for cells in connMat:
        for cell in cells:
            for label in (0, -1, 1):
                if cell == label:
                    tally[label] += 1
                    break
    total = nComp * nProtein
    print("Total Data: " + str(total))
    print("Positive Data: " + str(tally[1]))
    print("Unlabeled Data: " + str(tally[0]))
    print("Negative Data: " + str(tally[-1]))
def auprTest():
    """Run the BLM-NII AUPR experiment described by ``sys.argv``.

    Expected arguments (``sys.argv[2:7]``): number of cores, dataset code,
    evaluation mode (``"loocv"`` or ``"kfold"``), data directory and output
    directory.  With a wrong argument count the usage line is printed and the
    function returns.

    Side effects:
        * rebinds the module globals ``classParam``, ``comSimMat``,
          ``proSimMat`` and ``connMat``;
        * ``classParam`` is the same object as ``blmniiConfig``, so the
          overrides set here are written into that dict;
        * writes ``perf_blmnii_<tag>_perf.json`` and ``<tag>_pr_curve_.png``
          into ``outPath``.

    Raises:
        AssertionError: if the evaluation mode is unknown (raised explicitly
            and before any data is loaded).
    """
    global classParam, comSimMat, proSimMat, connMat, comNetSim, proNetSim
    classParam = blmniiConfig
    classParam["proba"] = False
    classParam["data"] = "precomputed"
    classParam["kernel"] = ["precomputed", "rbf"]
    classParam["retValue"] = "score"

    if len(sys.argv) != 7:
        print("Usage: python devel.py aupr [numCore] [DataSetCode] [evalMode] [dataPath] [outPath]")
        return
    core = int(sys.argv[2])
    dataset, evalMode, dataPath, outPath = sys.argv[3:7]

    # Check the cheap arguments first; an ``assert`` placed after the kernel
    # work would waste that work and vanish under -O.
    if evalMode not in ("loocv", "kfold"):
        raise AssertionError("Unknown evalMode: " + evalMode)
    if core < 1:
        print("Error: Invalid core processor number")
        return

    print("Building Data")
    connMat, comList, proList = yam.loadComProConnMat(dataset,
                                                      dataPath + "/Adjacency")
    kernel = yam.loadKernel(dataset, dataPath)
    nComp = len(comList)
    nProtein = len(proList)
    comListIdx = list(range(nComp))
    proListIdx = list(range(nProtein))

    # Average kernel[(i, j)] and kernel[(j, i)] so both matrices are
    # symmetric.
    comSimMat = np.zeros((nComp, nComp), dtype=float)
    for row, i in enumerate(comList):
        for col, j in enumerate(comList):
            comSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2
    proSimMat = np.zeros((nProtein, nProtein), dtype=float)
    for row, i in enumerate(proList):
        for col, j in enumerate(proList):
            proSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2

    comSimMat = {"precomputed": regularizationKernel(comSimMat)}
    proSimMat = {"precomputed": regularizationKernel(proSimMat)}

    print("Split Dataset...")
    if evalMode == "loocv":
        comSplit = KFold(n_splits=nComp, shuffle=True).split(comListIdx)
        proSplit = KFold(n_splits=nProtein, shuffle=True).split(proListIdx)
    else:  # "kfold"
        kSplit = KFold(n_splits=10, shuffle=True)
        comSplit = kSplit.split(comListIdx)
        proSplit = kSplit.split(proListIdx)

    # Materialise the folds; proteins are consumed first, as before.
    proTestList = []
    proTrainList = []
    for trainIndex, testIndex in proSplit:
        proTestList.append(list(testIndex))
        proTrainList.append(list(trainIndex))
    comTestList = []
    comTrainList = []
    for trainIndex, testIndex in comSplit:
        comTestList.append(list(testIndex))
        comTrainList.append(list(trainIndex))

    if core == 1:
        predRes, testData = singleProc(comTrainList, proTrainList,
                                       comTestList, proTestList)
    else:
        # Build the "rbf" similarity once before dispatching to workers.
        tempPred = BLMNII(classParam)
        tempPred.setAttr(connMat=connMat, comSimMat=comSimMat,
                         proSimMat=proSimMat)
        comSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("com"))
        proSimMat["rbf"] = regularizationKernel(tempPred.makeNetSim("pro"))
        tempPred.setAttr(comSimMat=comSimMat, proSimMat=proSimMat)
        predRes, testData = parallelProc(core, comTrainList, proTrainList,
                                         comTestList, proTestList)

    #####################################################################
    print("\nCalculate Performance")
    key = 'BLM-NII'
    precision, recall, _ = precision_recall_curve(testData, predRes)
    prAUC = average_precision_score(testData, predRes, average='micro')

    print("Visualiation")
    lineType = 'k-.'
    perf2 = {'prAUC': prAUC, 'nTest': nComp * nProtein}
    tag = (dataset + '_' + evalMode + '_' + str(classParam["alpha"]) + '_'
           + str(classParam["gamma"]))

    # Join with '/' like the PNG path below; without it the report was
    # written beside outPath rather than inside it.
    with open(outPath + '/' + 'perf_blmnii_' + tag + '_perf.json', 'w') as fp:
        json.dump(perf2, fp, indent=2, sort_keys=True)

    plt.clf()
    plt.figure()
    plt.plot(recall, precision, lineType,
             label=key + ' (area = %0.2f)' % prAUC, lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.savefig(outPath + '/' + tag + '_pr_curve_.png', bbox_inches='tight')
def main():
    """Derive cluster-based negative labels for compound-protein pairs.

    Takes one argument, ``targetDir``, from ``sys.argv``; with any other
    argument count the usage text is printed and the function returns.

    For every (compound metric, protein metric) combination:
      1. count known interactions (``connMat == 1``) between each pair of
         non-outlier compound/protein clusters;
      2. copy ``connMat`` and set to -1 every non-positive cell whose
         compound and protein clusters share no known interaction (cells
         with cluster label -1 are left untouched);
      3. write ``<tag>_labels.pkl`` (label -> list of keys from
         ``util.connMat2Dict``) and ``<tag>_labels_stat.json``
         (label -> [count, fraction]) into ``targetDir``.
    """
    if len(sys.argv) != 2:
        print('USAGE:')
        print('python cluster2.py [targetDir]')
        print('/param targetDir: dir containing one compound and one protein clustering results')
        return

    tdir = sys.argv[1]
    metrics = ['calinskiharabaz', 'silhouette']
    modes = ['compound', 'protein']

    print('loading connMat...')
    # Strip trailing separators so "dir/" and "dir" name the same dataset.
    dataset = tdir.rstrip('/').split('/')[-1].split('-')[-1]
    dParam = dataset.split('#')
    dpath = os.path.join(DATASET_DIR, dParam[0], 'ground-truth')
    connMat, comList, proList = yam.loadComProConnMat(dParam[1], dpath)

    # Item -> first position, built once so the counting loops below do not
    # rescan the lists with list.index().  A missing item now raises
    # KeyError instead of ValueError.
    comIndex = {}
    for pos, item in enumerate(comList):
        comIndex.setdefault(item, pos)
    proIndex = {}
    for pos, item in enumerate(proList):
        proIndex.setdefault(item, pos)

    print('loading compound and protein clusters...')

    def _loadCluster(mode, metric):
        # Read the best-label file of one mode/metric and invert it.
        dname = [i for i in os.listdir(tdir) if (mode in i)][0]
        clusterDir = os.path.join(tdir, dname)
        assert os.path.isdir(clusterDir)

        fname = '_'.join([mode, metric, 'bestlabels.json'])
        with open(os.path.join(clusterDir, fname), 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader is unsafe
            # on untrusted files; consider yaml.safe_load.
            item2clusterlabel = yaml.load(f)

        clusterlabel2item = defaultdict(list)
        for k, v in item2clusterlabel.items():
            clusterlabel2item[v].append(k)
        return {'item2clusterlabel': item2clusterlabel,
                'clusterlabel2item': clusterlabel2item}

    clusterData = {}
    for mode in modes:
        for metric in metrics:
            clusterData[(mode, metric)] = _loadCluster(mode, metric)

    def _getNumberOfConnBetweenComProClusters(comMetric, proMetric):
        # Number of known interactions per (compound, protein) cluster pair.
        comClusters = clusterData[('compound', comMetric)]['clusterlabel2item']
        proClusters = clusterData[('protein', proMetric)]['clusterlabel2item']
        nConnDict = {}
        for comCluster in comClusters:
            if comCluster == -1:
                continue
            for proCluster in proClusters:
                if proCluster == -1:
                    continue
                nConn = 0
                for com in comClusters[comCluster]:
                    for pro in proClusters[proCluster]:
                        conn = int(connMat[comIndex[com]][proIndex[pro]])
                        if conn == 1:
                            nConn += 1
                nConnDict[(comCluster, proCluster)] = nConn
        return nConnDict

    connAmongComProClusters = dict()
    for comMet in metrics:
        for proMet in metrics:
            print('get clusterConn of ' + comMet + ' and ' + proMet)
            connAmongComProClusters[(comMet, proMet)] = \
                _getNumberOfConnBetweenComProClusters(comMet, proMet)

    for comMet in metrics:
        for proMet in metrics:
            print('getting connMat2 of ' + comMet + ' and ' + proMet)
            comLabels = clusterData[('compound', comMet)]['item2clusterlabel']
            proLabels = clusterData[('protein', proMet)]['item2clusterlabel']
            clusterConn = connAmongComProClusters[(comMet, proMet)]

            connMat2 = np.copy(connMat)
            for i in range(connMat.shape[0]):
                for j in range(connMat.shape[1]):
                    if connMat[i][j] == 1:
                        continue  # because of known positive interaction
                    comCluster = comLabels[comList[i]]
                    proCluster = proLabels[proList[j]]
                    if (comCluster == -1) or (proCluster == -1):
                        continue  # because of outlier cluster label
                    if clusterConn[(comCluster, proCluster)] == 0:
                        connMat2[i][j] = -1

            # label -> keys, then label -> [count, fraction of all keys]
            connDict = defaultdict(list)
            connDictRaw = util.connMat2Dict(connMat2, comList, proList)
            for k, v in connDictRaw.items():
                connDict[int(v)].append(k)

            connDict2 = defaultdict(list)
            for k, v in connDict.items():
                connDict2[k].append(len(v))
            summ = sum(v[0] for v in connDict2.values())
            for v in connDict2.values():
                v.append(float(v[0]) / summ)

            tag = '_'.join([comMet, proMet])
            # pickle writes bytes, so the file is opened in binary mode.
            with open(os.path.join(tdir, tag + "_labels.pkl"), 'wb') as f:
                pickle.dump(connDict, f)
            with open(os.path.join(tdir, tag + "_labels_stat.json"), 'w') as f:
                json.dump(connDict2, f, indent=2, sort_keys=True)
def main():
    """Run the SELFBLM experiment and write its AUPR report and PR curve.

    Command line (via ``sys.argv``): dataset code, evaluation mode
    (``"loocv"`` or ``"kfold"``), data directory, output directory.  With
    any other argument count the usage line is printed and the function
    returns.

    Steps: build symmetrised compound/protein similarity matrices, cluster
    both with ``kmedoid.kMedoids`` (half as many clusters as items), relabel
    the connectivity matrix with ``genNegativeData``, predict every
    (compound, protein) test pair fold by fold with ``SELFBLM``, then write
    ``perf_selfblm_<evalMode>_<dataset>_perf.json`` and
    ``pr_curve_<dataset>_<evalMode>_selfblm.png`` into ``outPath``.

    Raises:
        AssertionError: if the evaluation mode is unknown (raised explicitly
            and before any data is loaded).
    """
    if len(sys.argv) != 5:
        print("python blmniisvm_experiment.py [DataSetCode] [evalMode]"
              " [dataPath] [outPath]")
        return

    classParam = dict(name='blmnii', proba=True)
    dataset = sys.argv[1]
    evalMode = sys.argv[2]
    dataPath = sys.argv[3]
    outPath = sys.argv[4]

    # Validate before the expensive clustering; a bare ``assert`` would be
    # stripped under -O.
    if evalMode not in ("loocv", "kfold"):
        raise AssertionError("Unknown evalMode: " + evalMode)

    print("Building Data")
    connMat, comList, proList = yam.loadComProConnMat(dataset,
                                                      dataPath + "/Adjacency")
    kernel = yam.loadKernel(dataset, dataPath)
    nComp = len(comList)
    nProtein = len(proList)
    comListIdx = list(range(nComp))
    proListIdx = list(range(nProtein))

    # Average both orientations of every kernel entry.
    comSimMat = np.zeros((nComp, nComp), dtype=float)
    proSimMat = np.zeros((nProtein, nProtein), dtype=float)
    for row, i in enumerate(comList):
        for col, j in enumerate(comList):
            comSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2
    for row, i in enumerate(proList):
        for col, j in enumerate(proList):
            proSimMat[row][col] = (kernel[(i, j)] + kernel[(j, i)]) / 2

    comSimMat = regularizationKernel(comSimMat)
    proSimMat = regularizationKernel(proSimMat)

    print("Clustering")
    comDisMat = kmedoid.simToDis(comSimMat)
    proDisMat = kmedoid.simToDis(proSimMat)
    # Floor division keeps the cluster count an int under true division too.
    # Proteins are clustered before compounds, as before.
    _, proClust = kmedoid.kMedoids(nProtein // 2, proDisMat)
    _, comClust = kmedoid.kMedoids(nComp // 2, comDisMat)

    print("Generate Negative Data")
    connMat = genNegativeData(connMat, proClust, comClust)

    print("Split Dataset...")
    if evalMode == "loocv":
        comSplit = KFold(n_splits=nComp, shuffle=True).split(comListIdx)
        proSplit = KFold(n_splits=nProtein, shuffle=True).split(proListIdx)
    else:  # "kfold"
        kSplit = KFold(n_splits=10, shuffle=True)
        comSplit = kSplit.split(comListIdx)
        proSplit = kSplit.split(proListIdx)

    # Materialise the folds; the protein split is consumed first, as before.
    proTestList = []
    proTrainList = []
    comTestList = []
    comTrainList = []
    for trainIndex, testIndex in proSplit:
        proTestList.append(list(testIndex))
        proTrainList.append(list(trainIndex))
    for trainIndex, testIndex in comSplit:
        comTestList.append(list(testIndex))
        comTrainList.append(list(trainIndex))

    predRes = []
    testData = []
    print("Predicting...")
    for ii, comTest in enumerate(comTestList):
        for jj, proTest in enumerate(proTestList):
            sys.stdout.write("\r%03d of %03d||%03d of %03d" %
                             (jj + 1, len(proTestList),
                              ii + 1, len(comTestList),))
            sys.stdout.flush()

            # One predictor per (compound fold, protein fold) combination.
            predictor = SELFBLM(classParam, connMat, comSimMat, proSimMat,
                                [comTrainList[ii], proTrainList[jj]],
                                [comTest, proTest])
            for comp in comTest:
                for prot in proTest:
                    predRes.append(predictor.predict([(comp, prot)]))
                    # Everything that is not a known positive counts as -1.
                    if connMat[comp][prot] == 1:
                        testData.append(1)
                    else:
                        testData.append(-1)

    # Evaluate prediction
    print("\nCalculate Performance")
    key = 'PredictionUsingSelfBLM'
    precision, recall, _ = precision_recall_curve(testData, predRes)
    prAUC = average_precision_score(testData, predRes, average='micro')

    print("Visualiation")
    lineType = 'k-.'
    perf2 = {'prAUC': prAUC, 'nTest': nComp * nProtein}

    # The report previously lacked the '/' separator that the PNG path below
    # has, so it was written beside outPath instead of inside it.
    with open(outPath + '/' + 'perf_selfblm_' + evalMode + '_' + dataset
              + '_perf.json', 'w') as fp:
        json.dump(perf2, fp, indent=2, sort_keys=True)

    plt.clf()
    plt.figure()
    plt.plot(recall, precision, lineType,
             label=key + ' (area = %0.2f)' % prAUC, lw=2)
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.savefig(outPath + '/pr_curve_' + dataset + '_' + evalMode
                + '_selfblm.png', bbox_inches='tight')
def main():
    """Cluster compounds and proteins with k-medoids and save the results.

    Command line (via ``sys.argv``), in the order the code reads it: data
    directory, dataset code, output directory.  With any other argument
    count the usage line is printed and the function returns.

    Writes three JSON files into the output directory:
    ``perf_medoid_<dataset>.json`` (silhouette and Calinski-Harabaz scores),
    ``cluster_medoid_com_<dataset>.json`` and
    ``cluster_medoid_pro_<dataset>.json`` (cluster label -> member names).
    """
    if len(sys.argv) != 4:
        # The message lists the arguments in the order they are read below;
        # it used to advertise the dataset code first.
        print("Usage: python kmedoid.py [dataDir] [e|ic|gpcr|nr] [outputDir]")
        return
    dataPath = sys.argv[1]
    dataset = sys.argv[2]
    outPath = sys.argv[3]

    # Load file
    print("Preparing data")
    _, comList, proList = yam.loadComProConnMat(dataset,
                                                dataPath + "/Adjacency")
    kernel = yam.loadKernel(dataset, dataPath)
    nComp = len(comList)
    nProtein = len(proList)

    comSimMat = np.zeros((nComp, nComp), dtype=float)
    proSimMat = np.zeros((nProtein, nProtein), dtype=float)
    for row, i in enumerate(comList):
        for col, j in enumerate(comList):
            comSimMat[row][col] = kernel[(i, j)]
    for row, i in enumerate(proList):
        for col, j in enumerate(proList):
            proSimMat[row][col] = kernel[(i, j)]

    # convert similarity matrix to distance matrix
    proDisMat = simToDis(proSimMat)
    comDisMat = simToDis(comSimMat)

    print("Clustering")
    # Floor division keeps the cluster count an int under true division too.
    # Proteins are clustered before compounds, as before; the medoid results
    # are not used.
    _, proClust = kMedoids(nProtein // 2, proDisMat)
    _, comClust = kMedoids(nComp // 2, comDisMat)

    # Take each label for each sample
    comLabelList = np.zeros((nComp))
    proLabelList = np.zeros((nProtein))
    proMetaClust = dict()
    comMetaClust = dict()
    for lab in proClust:
        members = []
        for idx in proClust[lab]:
            members.append(proList[idx])
            proLabelList[idx] = lab
        proMetaClust[lab] = members
    for lab in comClust:
        members = []
        for idx in comClust[lab]:
            members.append(comList[idx])
            comLabelList[idx] = lab
        comMetaClust[lab] = members

    print("Evaluation")
    comSilhouette = met.silhouette_score(comDisMat, comLabelList,
                                         metric="precomputed")
    proSilhouette = met.silhouette_score(proDisMat, proLabelList,
                                         metric="precomputed")
    # NOTE(review): the distance matrices are passed here as the sample
    # matrix -- confirm this is the intended input for this score.
    comCalinskiHarabaz = met.calinski_harabaz_score(comDisMat, comLabelList)
    proCalinskiHarabaz = met.calinski_harabaz_score(proDisMat, proLabelList)

    print("Silhouette score :\nCompound cluster = " + str(comSilhouette) +
          ",Protein cluster = " + str(proSilhouette))
    print("Calinski Harabaz score :\nCompound cluster = " +
          str(comCalinskiHarabaz) +
          ", Protein cluster = " + str(proCalinskiHarabaz))

    print("Writing Output")
    # The trailing underscore in 'silhouette_score_' is kept as-is because
    # readers of this file may already depend on the key.
    perf = {
        'silhouette_score_': {'compound': comSilhouette,
                              'protein': proSilhouette},
        'calinski_harabaz_score': {'compound': comCalinskiHarabaz,
                                   'protein': proCalinskiHarabaz},
    }
    with open(outPath + "/perf_medoid_" + dataset + ".json", 'w') as f:
        json.dump(perf, f, indent=2, sort_keys=True)
    with open(outPath + "/cluster_medoid_com_" + dataset + ".json", 'w') as f:
        json.dump(comMetaClust, f, indent=2, sort_keys=True)
    with open(outPath + "/cluster_medoid_pro_" + dataset + ".json", 'w') as f:
        json.dump(proMetaClust, f, indent=2, sort_keys=True)